diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 9c18c03d7bdb8406d43aa8fc4efdb8a206b1217e..b3c669ec1097745bd30f650ca0b9dacda647c61d 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -65,7 +65,7 @@ RUN locale-gen "en_US.UTF-8" RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev RUN git clone https://github.com/verilator/verilator RUN cd verilator && \ - git checkout v4.012 && \ + git checkout v4.224 && \ autoconf && \ ./configure && \ make -j4 && \ diff --git a/fetch-repos.sh b/fetch-repos.sh index 10b6b332550be5d914d80e242f01e77daeaf08a0..b0f6400ed142b203b1c9f6d7ea4ac6ababcf34d1 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,12 +27,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="92184fea2dd417bc7a53c82811fef271e4833c4c" +QONNX_COMMIT="f702b17cdb9d5e57f85f43a5d33890647e063de6" FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" -PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2" +PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="e7f2de91d1a2ddadaaea06b8f4c20e97a575470e" +HLSLIB_COMMIT="d27f6b6c5d8f1bb208db395659389603f63ad4be" OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v index b4e89628a44bb1f55c3445ee8e6866beada23585..11cef604e0a3d106529a65ae229bc4cb419c4d70 100644 --- a/finn-rtllib/memstream/hdl/Q_srl.v +++ b/finn-rtllib/memstream/hdl/Q_srl.v @@ -69,7 +69,7 @@ `define Q_srl -module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); +module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) parameter width = 16; // - width of data (i_d, o_d) @@ -90,7 +90,9 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); wire o_b; // - output stream back-pressure output [addrwidth:0] count; // - output number of elems in queue + output [addrwidth:0] maxcount; // - maximum observed count since reset + reg [addrwidth:0] maxcount_reg; // - maximum count seen until now reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address // for data output reg shift_en_; // - SRL16 shift enable @@ -124,6 +126,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); assign o_d = srlo; // - output data from queue assign o_v = o_v_reg; // - output valid if non-empty assign i_b = i_b_reg; // - input bp if full + assign maxcount = maxcount_reg; assign i_r = !i_b; assign o_b = !o_r; @@ -139,7 +142,10 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); addr <= 0; addr_full <= 0; o_v_reg <= 0; - i_b_reg <= 1; + + i_b_reg <= 0; + maxcount_reg <= 0; + end else begin state <= state_; @@ -147,6 +153,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); addr_full <= addr_full_; o_v_reg <= o_v_reg_; i_b_reg <= i_b_reg_; + maxcount_reg <= (count > maxcount_reg ? 
count : maxcount_reg); end end // always @ (posedge clock) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index e16711f63b954707bc7ad9050dd7627ca1ce99c1..d3c4156d9b4ccf601d3eea348f6cb61c0d9a6e87 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -37,6 +37,13 @@ from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map +class AutoFIFOSizingMethod(str, Enum): + "Select the type of automatic FIFO sizing strategy." + + CHARACTERIZE = "characterize" + LARGEFIFO_RTLSIM = "largefifo_rtlsim" + + class ShellFlowType(str, Enum): """For builds that produce a bitfile, select the shell flow that will integrate the FINN-generated accelerator.""" @@ -246,6 +253,12 @@ class DataflowBuildConfig: #: for each FIFO. auto_fifo_depths: Optional[bool] = True + #: When `auto_fifo_depths = True`, select which method will be used for + #: setting the FIFO sizes. + auto_fifo_strategy: Optional[ + AutoFIFOSizingMethod + ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM + #: Memory resource type for large FIFOs #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO @@ -320,6 +333,10 @@ class DataflowBuildConfig: #: Override the number of inputs for rtlsim performance measurement. rtlsim_batch_size: Optional[int] = 1 + #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during + #: rtlsim, otherwise they will be replaced by HLS implementations. + rtlsim_use_vivado_comps: Optional[bool] = True + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index e77f17d7c27f4be08aa6725e5803a1ea566c9443..5da608c27def8136f9ad11f62b4707452eac3120 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -29,6 +29,7 @@ import json import numpy as np import os +import shutil from copy import deepcopy from distutils.dir_util import copy_tree from qonnx.core.modelwrapper import ModelWrapper @@ -78,6 +79,10 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.derive_characteristic import ( + DeriveCharacteristic, + DeriveFIFOSizes, +) from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO @@ -85,6 +90,7 @@ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) @@ -121,81 +127,126 @@ def verify_step( verify_out_dir = cfg.output_dir + "/verification_output" intermediate_models_dir = cfg.output_dir + "/intermediate_models" os.makedirs(verify_out_dir, exist_ok=True) - (in_npy, exp_out_npy) = cfg._resolve_verification_io_pair() - if 
need_parent: - assert ( - cfg.save_intermediate_models - ), "Enable save_intermediate_models for verification" - parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx" - child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name - model.save(child_model_fn) - out_tensor_name = ModelWrapper(parent_model_fn).graph.output[0].name - out_dict = execute_parent( - parent_model_fn, child_model_fn, in_npy, return_full_ctx=True - ) - out_npy = out_dict[out_tensor_name] - else: - inp_tensor_name = model.graph.input[0].name - out_tensor_name = model.graph.output[0].name - inp_dict = {inp_tensor_name: in_npy} - if rtlsim_pre_hook is not None: - out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook) + (in_npy_all, exp_out_npy_all) = cfg._resolve_verification_io_pair() + bsize_in = in_npy_all.shape[0] + bsize_out = exp_out_npy_all.shape[0] + assert bsize_in == bsize_out, "Batch sizes don't match for verification IO pair" + all_res = True + for b in range(bsize_in): + in_npy = np.expand_dims(in_npy_all[b], axis=0) + exp_out_npy = np.expand_dims(exp_out_npy_all[b], axis=0) + if need_parent: + assert ( + cfg.save_intermediate_models + ), "Enable save_intermediate_models for verification" + parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx" + child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name + model.save(child_model_fn) + parent_model = ModelWrapper(parent_model_fn) + out_tensor_name = parent_model.graph.output[0].name + exp_ishape = parent_model.get_tensor_shape(parent_model.graph.input[0].name) + if in_npy.shape != exp_ishape: + print( + "Verification input has shape %s while model expects %s" + % (str(in_npy.shape), str(exp_ishape)) + ) + print("Attempting to force model shape on verification input") + in_npy = in_npy.reshape(exp_ishape) + out_dict = execute_parent( + parent_model_fn, child_model_fn, in_npy, return_full_ctx=True + ) + out_npy = out_dict[out_tensor_name] else: - out_dict = execute_onnx(model, inp_dict, True) - out_npy = out_dict[out_tensor_name] - res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all() - res_to_str = {True: "SUCCESS", False: "FAIL"} - res_str = res_to_str[res] - if cfg.verify_save_full_context: - verification_output_fn = verify_out_dir + "/verify_%s_%s.npz" % ( - step_name, - res_str, - ) - np.savez(verification_output_fn, **out_dict) - else: - verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % ( - step_name, - res_str, - ) - np.save(verification_output_fn, out_npy) - print("Verification for %s : %s" % (step_name, res_str)) + inp_tensor_name = model.graph.input[0].name + out_tensor_name = model.graph.output[0].name + exp_ishape = model.get_tensor_shape(inp_tensor_name) + if in_npy.shape != exp_ishape: + print( + "Verification input has shape %s while model expects %s" + % (str(in_npy.shape), str(exp_ishape)) + ) + print("Attempting to force model shape on verification input") + in_npy = in_npy.reshape(exp_ishape) + inp_dict = {inp_tensor_name: in_npy} + if rtlsim_pre_hook is not None: + out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook) + else: + out_dict = execute_onnx(model, inp_dict, True) + out_npy = out_dict[out_tensor_name] + exp_oshape = exp_out_npy.shape + if out_npy.shape != exp_oshape: + print( + "Verification output has shape %s while model produces %s" + % (str(exp_oshape), str(out_npy.shape)) + ) + print("Attempting to force model shape on verification output") + out_npy = out_npy.reshape(exp_oshape) + + res = np.isclose(exp_out_npy, out_npy, 
atol=1e-3).all() + all_res = all_res and res + res_to_str = {True: "SUCCESS", False: "FAIL"} + res_str = res_to_str[res] + if cfg.verify_save_full_context: + verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npz" % ( + step_name, + b, + res_str, + ) + np.savez(verification_output_fn, **out_dict) + else: + verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npy" % ( + step_name, + b, + res_str, + ) + np.save(verification_output_fn, out_npy) + if cfg.verify_save_rtlsim_waveforms: + vcd_path = model.get_metadata_prop("rtlsim_trace") + if vcd_path is not None and os.path.isfile(vcd_path): + new_vcd_path = vcd_path.replace(".vcd", "_%d.vcd" % b) + shutil.move(vcd_path, new_vcd_path) + print("Verification for %s : %s" % (step_name, res_to_str[all_res])) def prepare_for_stitched_ip_rtlsim(verify_model, cfg): - need_restitch = False - # rtlsim only supports certain impl_style for some nodes - # StreamingFIFO must have impl_style=rtl - for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): - inst = getCustomOp(fifo_layer) - if inst.get_nodeattr("impl_style") != "rtl": - inst.set_nodeattr("impl_style", "rtl") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True - # StreamingDataWidthConverter must have impl_style=hls - for dwc_layer in verify_model.get_nodes_by_op_type( - "StreamingDataWidthConverter_Batch" - ): - inst = getCustomOp(dwc_layer) - if inst.get_nodeattr("impl_style") != "hls": - inst.set_nodeattr("impl_style", "hls") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True - # if we've made alterations to the model, need to do some re-prep - if need_restitch: - print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") - verify_model = verify_model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) - ) - verify_model = verify_model.transform(HLSSynthIP()) - verify_model = verify_model.transform( - CreateStitchedIP( - cfg._resolve_fpga_part(), - cfg.synth_clk_period_ns, - vitis=False, + if not cfg.rtlsim_use_vivado_comps: + need_restitch = False + # switch impl_style=vivado components to rtl/hls + # StreamingFIFO must have impl_style=rtl + for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): + inst = getCustomOp(fifo_layer) + if inst.get_nodeattr("impl_style") != "rtl": + inst.set_nodeattr("impl_style", "rtl") + inst.set_nodeattr("code_gen_dir_ipgen", "") + inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # StreamingDataWidthConverter must have impl_style=hls + for dwc_layer in verify_model.get_nodes_by_op_type( + "StreamingDataWidthConverter_Batch" + ): + inst = getCustomOp(dwc_layer) + if inst.get_nodeattr("impl_style") != "hls": + inst.set_nodeattr("impl_style", "hls") + inst.set_nodeattr("code_gen_dir_ipgen", "") + inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # if we've made alterations to the model, need to do some re-prep + if need_restitch: + print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") + verify_model = verify_model.transform( + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) ) - ) + verify_model = verify_model.transform(HLSSynthIP()) + verify_model = verify_model.transform( + CreateStitchedIP( + cfg._resolve_fpga_part(), + cfg.synth_clk_period_ns, + vitis=False, + ) + ) + else: + print("rtlsim_use_vivado_comps is enabled, may yield incorrect results") + # set top-level prop for stitched-ip rtlsim and launch 
verify_model.set_metadata_prop("exec_mode", "rtlsim") # TODO make configurable @@ -449,9 +500,9 @@ def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): """ Depending on the auto_fifo_depths setting, do one of the following: - * if auto_fifo_depths=True: Run the `InsertAndSetFIFODepths` transformation - to attempt to determine the FIFO sizes that provide full throughput. Involves - running stitched-IP rtlsim and may take a long time. + * if auto_fifo_depths=True: Run the appropriate auto-sizing transformation + to attempt to determine the FIFO sizes that provide full throughput. + May take a long time. * if auto_fifo_depths=False: Assume the folding config file contains FIFO sizes as well. Runs the `InsertFIFO` transformation, then `ApplyConfig(cfg.folding_config_file)`, and finally `RemoveShallowFIFOs`. @@ -460,13 +511,35 @@ """ if cfg.auto_fifo_depths: - model = model.transform( - InsertAndSetFIFODepths( - cfg._resolve_fpga_part(), - cfg._resolve_hls_clk_period(), - vivado_ram_style=cfg.large_fifo_mem_style, + if cfg.auto_fifo_strategy == "characterize": + model = model.transform(InsertDWC()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform( + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) ) - ) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + model = model.transform(AnnotateCycles()) + period = model.analysis(dataflow_performance)["max_cycles"] + 10 + model = model.transform(DeriveCharacteristic(period)) + model = model.transform(DeriveFIFOSizes()) + model = model.transform( + InsertFIFO( + vivado_ram_style=cfg.large_fifo_mem_style, max_qsrl_depth=256 + ) + ) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + elif cfg.auto_fifo_strategy == "largefifo_rtlsim": + model = model.transform( + InsertAndSetFIFODepths( + cfg._resolve_fpga_part(), + cfg._resolve_hls_clk_period(), + vivado_ram_style=cfg.large_fifo_mem_style, + ) + ) + else: + raise Exception("Unsupported auto_fifo_strategy: " + cfg.auto_fifo_strategy) else: # assume folding cfg json contains FIFO sizes too # insert DWCs, FIFOs and run ApplyConfig once more diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py index 13a4c5892c8f82c37e1794057a06217981a6a580..cd0af6b3ab3d8250abbf7d48e004622e55f09f04 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py @@ -42,18 +42,21 @@ class AddStreams_Batch(HLSCustomOp): super().__init__(onnx_node) def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a 
single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) return my_attrs def get_normal_input_shape(self, ind=0): @@ -70,10 +73,10 @@ class AddStreams_Batch(HLSCustomOp): ishape = tuple(vecs + [ich // pe, pe]) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_folded_input_shape() def make_shape_compatible_op(self, model): @@ -124,11 +127,11 @@ class AddStreams_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" # we need to set output datatype to the next larger int or uint # enhancement: consider specifying w/ explicit outputDataType attribute @@ -139,14 +142,14 @@ class AddStreams_Batch(HLSCustomOp): else: return DataType.get_smallest_possible(2 * idt.max()) - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") @@ -357,3 +360,14 @@ class AddStreams_Batch(HLSCustomOp): swidth = self.get_instream_width_padded() intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + "in1": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py index 3ed76db2982e411b711be5bd78e39dd866332714..46adca680d3c96695eeb5a91be53ea158fc78f1f 100644 --- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py +++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py @@ -102,9 +102,6 @@ class ChannelwiseOp_Batch(HLSCustomOp): "inputDataType": ("s", True, ""), "paramDataType": ("s", True, ""), "outputDataType": ("s", True, ""), - # input and output FIFO depths - "inFIFODepth": ("i", False, 0), - "outFIFODepth": ("i", False, 0), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -221,23 +218,23 @@ class ChannelwiseOp_Batch(HLSCustomOp): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() return i_bits 
* self.get_nodeattr("PE") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() return o_bits * self.get_nodeattr("PE") - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") fold = ich // pe @@ -245,17 +242,17 @@ class ChannelwiseOp_Batch(HLSCustomOp): folded_input_shape = tuple(vecs + [fold, pe]) return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): # same shape as input return self.get_folded_input_shape() - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [ich]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/checksum.py index bde285eb0dd1b3818926c1feb7ac8d5de69a4be6..c927c07df21faf40ccbf9ddbe47e3f2f2ca61c89 100644 --- a/src/finn/custom_op/fpgadataflow/checksum.py +++ b/src/finn/custom_op/fpgadataflow/checksum.py @@ -77,31 +77,31 @@ class CheckSum(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" # here same as input data type return DataType[self.get_nodeattr("inputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): dtype = DataType[self.get_nodeattr("inputDataType")] folded_shape = self.get_nodeattr("folded_shape") in_width = folded_shape[-1] * dtype.bitwidth() return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): return self.get_instream_width() - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): # derive normal shape from folded shape # checksum nodes are inserted in between fpgadataflow nodes # the folded shape could be for example (1, nf, pe) @@ -127,7 +127,7 @@ class CheckSum(HLSCustomOp): def get_ap_int_max_w(self): return max(super().get_ap_int_max_w(), 32) - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py index 5fcf9cf96cbacd4e444af0b90618a19eefb9bfe2..4437bcd1984c5194b0a19b43d692babb7e3cd158 100644 --- a/src/finn/custom_op/fpgadataflow/concat.py +++ b/src/finn/custom_op/fpgadataflow/concat.py @@ -74,12 +74,12 @@ class StreamingConcat(HLSCustomOp): def get_folded_input_shape(self, ind=0): return self.get_normal_input_shape(ind) - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): total_elems = self.get_total_elems() vecs = list(self.get_nodeattr("numInputVectors")) return tuple(vecs + [total_elems]) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return 
self.get_normal_output_shape() def make_shape_compatible_op(self, model): @@ -106,7 +106,7 @@ class StreamingConcat(HLSCustomOp): # input dt identical for all inputs return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): return self.get_input_datatype() def get_instream_width(self, ind=0): @@ -115,7 +115,7 @@ class StreamingConcat(HLSCustomOp): ibits = self.get_input_datatype().bitwidth() return elems * ibits - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() total_elems = self.get_total_elems() out_width = total_elems * obits diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 251a9882c58a3cf94449701795b72c8a6adab318..1566445999a2c568b5c5a112d436bf05fd89aca5 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -99,13 +99,13 @@ class ConvolutionInputGenerator(HLSCustomOp): assert ret[0] == ret[1] == 1, "Only dilation=1 supported" return ret - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") simd = self.get_nodeattr("SIMD") @@ -114,7 +114,7 @@ class ConvolutionInputGenerator(HLSCustomOp): folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -126,7 +126,7 @@ class ConvolutionInputGenerator(HLSCustomOp): oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -158,15 +158,15 @@ class ConvolutionInputGenerator(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns stream width, input and output stream width are equal for the sliding window function""" ibits = self.get_input_datatype().bitwidth() @@ -176,7 +176,7 @@ class ConvolutionInputGenerator(HLSCustomOp): in_width = simd * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns stream width, input and output stream width are equal for the sliding window function, so the function to determine the input stream width can be reused.""" diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py index aba74baecc0f40571fa288459a04ad42e167ccf6..f1c84662cc06e89df5bd7c0762ac47b8c5723502 100644 --- 
a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py @@ -91,13 +91,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") simd = self.get_nodeattr("SIMD") @@ -106,7 +106,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp): folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -118,7 +118,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp): oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -153,15 +153,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") @@ -169,7 +169,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp): in_width = simd * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): if self.use_parallel_window_output(): # feed all window pixels in parallel k_h, k_w = self.get_nodeattr("ConvKernelDim") diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py index 665325bdee56d7de5936fb544f744c0341358387..49e2621ecd9cf1a7182c2bb0f5d644e763ae18f6 100755 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -101,13 +101,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") simd = self.get_nodeattr("SIMD") @@ -116,7 +116,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") 
ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -128,7 +128,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -163,15 +163,15 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") @@ -179,7 +179,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): in_width = simd * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): if self.get_nodeattr("parallel_window"): # feed all window pixels in parallel k_h, k_w = self.get_nodeattr("ConvKernelDim") diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py index e9009e1856a2b379911969a69d258163e67c1197..b7efaff440dd5cc2160fbfb8050b30924460ffe6 100644 --- a/src/finn/custom_op/fpgadataflow/downsampler.py +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -79,7 +79,7 @@ class DownSampler(HLSCustomOp): exp_cycles = channels / simd * batch_size * idim_total return int(exp_cycles) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): is_1D = self.get_nodeattr("is1D") is_1D_unitx = self.get_nodeattr("is1D_unitx") idim = self.get_nodeattr("ImgDim") @@ -94,7 +94,7 @@ class DownSampler(HLSCustomOp): ishape = (batch, idim, idim, num_ch) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): is_1D = self.get_nodeattr("is1D") is_1D_unitx = self.get_nodeattr("is1D_unitx") odim = self.get_downsampled_odim() @@ -109,7 +109,7 @@ class DownSampler(HLSCustomOp): oshape = (batch, odim, odim, num_ch) return oshape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -118,7 +118,7 @@ class DownSampler(HLSCustomOp): folded_ishape = normal_ishape[:-1] + [fold, simd] return tuple(folded_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -151,21 +151,21 @@ class DownSampler(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. 
(Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return ibits * simd - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return obits * simd diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py index 04ca45e7f1c1844a9976d46392be46f6cffc2167..93cde15ca7d42dbed12417837916359fdcc71b67 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py @@ -61,13 +61,13 @@ class DuplicateStreams_Batch(HLSCustomOp): def get_num_output_streams(self): return self.get_nodeattr("NumOutputStreams") - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [ch]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -138,22 +138,22 @@ class DuplicateStreams_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("inputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") @@ -408,3 +408,13 @@ class DuplicateStreams_Batch(HLSCustomOp): ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) ) return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out0": [], "out1": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py index a29e871fabbc01f0accd6858d69c0a96a5a8c495..d6284750c73026c09fb7986ffc2517ed9ae3b153 100644 --- a/src/finn/custom_op/fpgadataflow/eltwise.py +++ b/src/finn/custom_op/fpgadataflow/eltwise.py @@ -42,21 +42,25 @@ class StreamingEltwise(HLSCustomOp): super().__init__(onnx_node) def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType0": ("s", True, ""), - "inputDataType1": ("s", True, ""), - # type of EltwiseFunction for the operation - "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - 
my_attrs.update(super().get_nodeattr_types()) + + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType0": ("s", True, ""), + "inputDataType1": ("s", True, ""), + # type of EltwiseFunction for the operation + "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) return my_attrs def get_eltwise_op_lambda(self): @@ -91,10 +95,10 @@ class StreamingEltwise(HLSCustomOp): ishape = tuple(vecs + [ich // pe, pe]) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_folded_input_shape() def make_shape_compatible_op(self, model): @@ -156,11 +160,11 @@ class StreamingEltwise(HLSCustomOp): return info_messages - def get_input_datatype(self, id=0): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType" + str(id))] + return DataType[self.get_nodeattr("inputDataType" + str(ind))] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" op = self.get_nodeattr("eltwiseOp") idt0 = self.get_input_datatype(0) @@ -196,7 +200,7 @@ class StreamingEltwise(HLSCustomOp): in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py index d69ea471ea8ae1d58f97d056936b505cc2a2806b..dfc55d283fa664e3b60fc7c4d5a056f53a119292 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -47,10 +47,6 @@ class FMPadding_Batch(HLSCustomOp): # spatial size of input images "ImgDim": ("ints", True, []), # [H, W] = [Y, X] # total padding (per dimension) to apply - # NOTE: Current padding scheme that is applied tries to pad the same - # amount of zeros in front and behind the image for each dimension. 
- # As an example, a padding scheme such as [1, x, 3, x] is equal - # to [2, x, 2, x] "Padding": ( "ints", True, @@ -62,10 +58,6 @@ class FMPadding_Batch(HLSCustomOp): "SIMD": ("i", False, 1), # FINN input datatype "inputDataType": ("s", True, ""), - # controls distribution of padded pixels - # in case of uneven padding -- see FMPadding fxn - # in hlslib - "PaddingStyle": ("i", False, 2, {2, 1}), # shape describing input vecs per execution "numInputVectors": ("i", False, 1), } @@ -90,20 +82,20 @@ class FMPadding_Batch(HLSCustomOp): exp_cycles = (channels / simd) * batch_size * odim_h * odim_w return int(exp_cycles) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): idim_h, idim_w = self.get_nodeattr("ImgDim") num_ch = self.get_nodeattr("NumChannels") ishape = (1, idim_h, idim_w, num_ch) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): odim_h, odim_w = self.get_padded_odim() num_ch = self.get_nodeattr("NumChannels") oshape = (1, odim_h, odim_w, num_ch) return oshape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -112,7 +104,7 @@ class FMPadding_Batch(HLSCustomOp): folded_ishape = normal_ishape[:-1] + [fold, simd] return tuple(folded_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -144,7 +136,7 @@ class FMPadding_Batch(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] # the hlslib op always pads with zeros, so ensure that the DataType @@ -152,16 +144,16 @@ class FMPadding_Batch(HLSCustomOp): assert ret.allowed(0), "FMPadding_Batch DataType must support zero" return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. 
(Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return ibits * simd - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return obits * simd @@ -179,23 +171,21 @@ class FMPadding_Batch(HLSCustomOp): pad = self.get_nodeattr("Padding") pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] - is_square = idim_h == idim_w + is_square_img = idim_h == idim_w + is_square_pad = pad_h == pad_w - if is_square: - assert ( - pad_h == pad_w - ), "Only equal padding along the dimensions for square images is supported" + if is_square_img and is_square_pad: self.code_gen_dict["$DEFINES$"] = [ """#define ImgDim1 {}\n#define OutputDim1 {}\n - #define Padding1 {}\n#define NumChannels1 {}\n - #define SIMD1 {}\n#define PaddingStyle1 {}\n + #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n + #define NumChannels1 {}\n#define SIMD1 {}\n #define numReps {}\n""".format( idim_h, odim_h, - pad_h, + pad[0], + pad[2], self.get_nodeattr("NumChannels"), self.get_nodeattr("SIMD"), - self.get_nodeattr("PaddingStyle"), self.get_nodeattr("numInputVectors"), ) ] @@ -204,20 +194,22 @@ class FMPadding_Batch(HLSCustomOp): """ #define OutputDim1_x {}\n #define OutputDim1_y {}\n - #define Padding1_x {}\n - #define Padding1_y {}\n + #define PaddingLeft1 {}\n + #define PaddingRight1 {}\n + #define PaddingTop1 {}\n + #define PaddingBottom1 {}\n #define NumChannels1 {}\n #define SIMD1 {}\n - #define PaddingStyle1 {}\n #define numReps {}\n """.format( odim_w, odim_h, - pad_w, - pad_h, + pad[1], + pad[3], + pad[0], + pad[2], self.get_nodeattr("NumChannels"), self.get_nodeattr("SIMD"), - self.get_nodeattr("PaddingStyle"), self.get_nodeattr("numInputVectors"), ) ] @@ -254,21 +246,26 @@ class FMPadding_Batch(HLSCustomOp): node = self.onnx_node idim_h, idim_w = self.get_nodeattr("ImgDim") - is_square = idim_h == idim_w + pad = self.get_nodeattr("Padding") + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + is_square_img = idim_h == idim_w + is_square_pad = pad_h == pad_w - if is_square: + if is_square_img and is_square_pad: hls_call = node.op_type self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1, - {}, PaddingStyle1> (in0, out, numReps);""".format( + """{}<ImgDim1, OutputDim1, PaddingBefore1, PaddingBehind1, NumChannels1, SIMD1, + {}> (in0, out, numReps);""".format( hls_call, in_t ) ] else: hls_call = "FMPadding_nonsquare_Batch" self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<OutputDim1_x, OutputDim1_y, Padding1_x, Padding1_y, NumChannels1, - SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format( + """{}<OutputDim1_x, OutputDim1_y, PaddingLeft1, PaddingRight1, + PaddingTop1, PaddingBottom1, NumChannels1, + SIMD1, {}> (in0, out, numReps);""".format( hls_call, in_t ) ] diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py index adafa7dcf36111e63fa49e0d184594fff54be99d..e7fa5bc0048b54a32ebc61482b96009fa019809e 100644 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py @@ -56,13 +56,13 @@ class GlobalAccPool_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") 
vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [ch]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -71,7 +71,7 @@ class GlobalAccPool_Batch(HLSCustomOp): folded_ishape = tuple(vecs + [folds, pe]) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) if len(vecs) == 1: @@ -80,7 +80,7 @@ class GlobalAccPool_Batch(HLSCustomOp): oshape = tuple([vecs[0]] + [1, 1, ch]) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") unfolded_shape = list(self.get_normal_output_shape()) @@ -139,11 +139,11 @@ class GlobalAccPool_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" # determine data type from image size and input type idt = DataType[self.get_nodeattr("inputDataType")] @@ -155,14 +155,14 @@ class GlobalAccPool_Batch(HLSCustomOp): extreme_value = npixels * idt.max() return DataType.get_smallest_possible(extreme_value) - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index c5041acd46a63880160f7726946e1c609642710d..f307be95c30d822dfc517e4c331bd8d82d727997 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -29,8 +29,9 @@ import numpy as np import os import subprocess +import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import rtlsim_multi_io +from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple @@ -107,10 +108,18 @@ class HLSCustomOp(CustomOp): # ID of FPGA device to which this Op is allocated, in # a multi-FPGA setting "device_id": ("i", False, 0), - # input and output FIFO depths - "inFIFODepth": ("i", False, 2), - "outFIFODepth": ("i", False, 2), + # input and output FIFO depths for multi-I/O nodes + "inFIFODepths": ("ints", False, [2]), + "outFIFODepths": ("ints", False, [2]), "output_hook": ("s", False, ""), + # accumulated characteristic function over two periods + "io_chrc_in": ("t", False, np.asarray([], dtype=np.int32)), + "io_chrc_out": ("t", False, np.asarray([], dtype=np.int32)), + # the period for which the characterization was run + "io_chrc_period": ("i", False, 0), + # amount of zero padding inserted during chrc. + "io_chrc_pads_in": ("ints", False, []), + "io_chrc_pads_out": ("ints", False, []), } def get_verilog_top_module_name(self): @@ -688,40 +697,48 @@ compilation transformations? 
HLSCustomOp class but has to be filled by every node.""" pass - def get_normal_input_shape(self): + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input stream ind.""" + raise Exception("get_input_datatype not implemented for this op") + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output stream ind.""" + raise Exception("get_output_datatype not implemented for this op") + + def get_normal_input_shape(self, ind=0): """Returns normal input shape if implemented.""" raise Exception("get_normal_input_shape not implemented for this op") - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): """Returns folded output shape if implemented.""" raise Exception("get_normal_output_shape not implemented for this op") - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): """Returns folded input shape (according to synapse folding), if implemented.""" raise Exception("get_folded_input_shape not implemented for this op") - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): """Returns folded output shape (according to neuron folding), if implemented.""" raise Exception("get_folded_output_shape not implemented for this op") - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width, if implemented.""" raise Exception("get_instream_width not implemented for this op") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width, if implemented.""" raise Exception("get_outstream_width not implemented for this op") - def get_instream_width_padded(self): + def get_instream_width_padded(self, ind=0): """Returns input stream width padded to a multiple of 8. This is required by the AXI Stream spec.""" - in_width = self.get_instream_width() + in_width = self.get_instream_width(ind=ind) return roundup_to_integer_multiple(in_width, 8) - def get_outstream_width_padded(self): + def get_outstream_width_padded(self, ind=0): """Returns output stream width padded to a multiple of 8. This is required by the AXI Stream spec.""" - out_width = self.get_outstream_width() + out_width = self.get_outstream_width(ind=ind) return roundup_to_integer_multiple(out_width, 8) def get_ap_int_max_w(self): @@ -734,3 +751,119 @@ compilation transformations? 
"AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret ) return ret + + def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): + """Return the unconstrained characteristic functions for this node.""" + # ensure rtlsim is ready + assert self.get_nodeattr("rtlsim_so") != "", ( + "rtlsim not ready for " + self.onnx_node.name + ) + if self.get_nodeattr("io_chrc_period") > 0: + warnings.warn( + "Skipping node %s: already has FIFO characteristic" + % self.onnx_node.name + ) + return + exp_cycles = self.get_exp_cycles() + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + n_outs = np.prod(self.get_folded_output_shape()[:-1]) + if exp_cycles == 0: + # try to come up with an optimistic estimate + exp_cycles = min(n_inps, n_outs) + assert ( + exp_cycles <= period + ), "Period %d too short to characterize %s : expects min %d cycles" % ( + period, + self.onnx_node.name, + exp_cycles, + ) + sim = self.get_rtlsim() + # signal name + sname = "_" + self.hls_sname() + "_" + if override_rtlsim_dict is not None: + io_dict = override_rtlsim_dict + else: + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + + # extra dicts to keep track of cycle-by-cycle transaction behavior + # note that we restrict key names to filter out weight streams etc + txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} + txns_out = { + key: [] for (key, value) in io_dict["outputs"].items() if "out" in key + } + + def monitor_txns(sim_obj): + for inp in txns_in: + in_ready = _read_signal(sim, inp + sname + "TREADY") == 1 + in_valid = _read_signal(sim, inp + sname + "TVALID") == 1 + if in_ready and in_valid: + txns_in[inp].append(1) + else: + txns_in[inp].append(0) + for outp in txns_out: + if ( + _read_signal(sim, outp + sname + "TREADY") == 1 + and _read_signal(sim, outp + sname + "TVALID") == 1 + ): + txns_out[outp].append(1) + else: + txns_out[outp].append(0) + + reset_rtlsim(sim) + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + n_outs, + sname=sname, + liveness_threshold=period, + hook_preclk=monitor_txns, + ) + assert ( + total_cycle_count <= period + ), """Total cycle count from rtl simulation is higher than + specified period, please set the period higher than {}""".format( + total_cycle_count + ) + self.set_nodeattr("io_chrc_period", period) + + def accumulate_char_fxn(chrc): + p = len(chrc) + ret = [] + for t in range(2 * p): + if t == 0: + ret.append(chrc[0]) + else: + ret.append(ret[-1] + chrc[t % p]) + return np.asarray(ret, dtype=np.int32) + + all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32) + all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32) + all_pad_in = [] + all_pad_out = [] + for in_idx, in_strm_nm in enumerate(txns_in.keys()): + txn_in = txns_in[in_strm_nm] + if len(txn_in) < period: + pad_in = period - len(txn_in) + txn_in += [0 for x in range(pad_in)] + txn_in = accumulate_char_fxn(txn_in) + all_txns_in[in_idx, :] = txn_in + all_pad_in.append(pad_in) + + for out_idx, out_strm_nm in enumerate(txns_out.keys()): + txn_out = txns_out[out_strm_nm] + if len(txn_out) < period: + pad_out = period - len(txn_out) + txn_out += [0 for x in range(pad_out)] + txn_out = accumulate_char_fxn(txn_out) + all_txns_out[out_idx, :] = txn_out + all_pad_out.append(pad_out) + + self.set_nodeattr("io_chrc_in", all_txns_in) + self.set_nodeattr("io_chrc_out", all_txns_out) + self.set_nodeattr("io_chrc_pads_in", all_pad_in) + self.set_nodeattr("io_chrc_pads_out", all_pad_out) 
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py index 33ee1d359c7b82494e1b5ce1b83aa5d0199f8153..65683079fc6a648de31148e398ea498f38b8d3d9 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -100,16 +100,16 @@ class IODMA(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): vecs = list(self.get_nodeattr("numInputVectors")) num_ch = self.get_nodeattr("NumChannels") ishape = tuple(vecs + [num_ch]) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): if self.get_nodeattr("direction") == "in": raise ValueError("Folded input shape not defined for input IODMA") else: @@ -126,7 +126,7 @@ class IODMA(HLSCustomOp): shape.append(elems_per_word) return tuple(shape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): if self.get_nodeattr("direction") == "out": raise ValueError("Folded output shape not defined for output IODMA") else: @@ -166,15 +166,15 @@ class IODMA(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("dataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. (Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): if self.get_nodeattr("direction") == "in": return self.get_nodeattr("intfWidth") elif self.get_nodeattr("direction") == "out": @@ -182,7 +182,7 @@ class IODMA(HLSCustomOp): else: raise ValueError("Invalid IODMA direction, please set to in or out") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): if self.get_nodeattr("direction") == "out": return self.get_nodeattr("intfWidth") elif self.get_nodeattr("direction") == "in": diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py index 3e27ee01113392174c1206fc10e1c9abe82fdfe7..03f89bd7ecac69a9097f4f35c42bd528be709515 100644 --- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py +++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py @@ -70,13 +70,13 @@ class LabelSelect_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): nlabels = self.get_nodeattr("Labels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [nlabels]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): nlabels = self.get_nodeattr("Labels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -85,13 +85,13 @@ class LabelSelect_Batch(HLSCustomOp): folded_ishape = tuple(vecs + [folds, pe]) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): k = self.get_nodeattr("K") vecs = list(self.get_nodeattr("numInputVectors")) oshape = tuple(vecs + [k]) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k = self.get_nodeattr("K") vecs = list(self.get_nodeattr("numInputVectors")) oshape = tuple(vecs + [k, 1]) @@ -152,24 +152,24 @@ class 
LabelSelect_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" ret = DataType[self.get_nodeattr("outputDataType")] return ret - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" return self.get_output_datatype().bitwidth() diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py index 613a91b6284e0789dff2446e1615690a03336d99..fd3e2b5b1cfa74eb4f957df4b568e6c46da47617 100644 --- a/src/finn/custom_op/fpgadataflow/lookup.py +++ b/src/finn/custom_op/fpgadataflow/lookup.py @@ -75,21 +75,21 @@ class Lookup(HLSCustomOp): exp_cycles = int(n_inputs) return exp_cycles - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): return self.get_nodeattr("InputShape") - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ishape = self.get_normal_input_shape() emb_dim = self.get_nodeattr("EmbeddingDim") oshape = list(ishape) + [emb_dim] return tuple(oshape) - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ishape = self.get_normal_input_shape() folded_ishape = list(ishape) + [1] return tuple(folded_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ishape = self.get_normal_input_shape() mem_mode = self.get_nodeattr("mem_mode") emb_dim = self.get_nodeattr("EmbeddingDim") @@ -135,19 +135,19 @@ class Lookup(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): ret = DataType[self.get_nodeattr("InputType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): ret = DataType[self.get_nodeattr("EmbeddingType")] return ret - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() return ibits - def get_outstream_width(self): + def get_outstream_width(self, ind=0): folded_oshape = self.get_folded_output_shape() obits = self.get_output_datatype().bitwidth() return obits * folded_oshape[-1] diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 9d2717dc8c65ddb5329816880067b81b10db2c02..69763fbea8a6079c7b0a61e14da37a3af69dfdfb 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -409,16 +409,16 @@ class MatrixVectorActivation(HLSCustomOp): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() in_width = i_bits * self.get_nodeattr("SIMD") return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * 
self.get_nodeattr("PE") return out_width @@ -474,7 +474,7 @@ class MatrixVectorActivation(HLSCustomOp): return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): mh = self.get_nodeattr("MH") pe = self.get_nodeattr("PE") nf = mh // pe @@ -482,13 +482,13 @@ class MatrixVectorActivation(HLSCustomOp): folded_output_shape = tuple(vecs + [nf, pe]) return folded_output_shape - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): mw = self.get_nodeattr("MW") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [mw]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): mh = self.get_nodeattr("MH") vecs = list(self.get_nodeattr("numInputVectors")) normal_output_shape = tuple(vecs + [mh]) @@ -1227,8 +1227,11 @@ class MatrixVectorActivation(HLSCustomOp): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() ) - in_fifo_depth = self.get_nodeattr("inFIFODepth") - out_fifo_depth = self.get_nodeattr("outFIFODepth") + # TODO can we deprecate this entirely? this looks like legacy code + # that does not really serve a purpose - FIFO sizes are not typically + # allocated at this point; at best they are set to 2 as the default + in_fifo_depth = 2 + out_fifo_depth = 2 # insert depth pragmas only if specified if in_fifo_depth != 0: self.code_gen_dict["$PRAGMAS$"].append( @@ -1462,3 +1465,20 @@ class MatrixVectorActivation(HLSCustomOp): thres_count = out_features ret_dict[thres_param_type] = thres_count return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index 3bf187fa9a78ed2c812f042a29079ee1e3163d74..91cd537baeff0c7666bbf3596b46a7412ec2fe4e 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -74,11 +74,11 @@ class Pool_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("InputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" fxn = self.get_nodeattr("Function") odt = DataType[self.get_nodeattr("OutputDataType")] @@ -98,7 +98,7 @@ class Pool_Batch(HLSCustomOp): return odt - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_ch = self.get_nodeattr("Channels") odims = self.get_nodeattr("OutImgDims") batch_size = self.get_nodeattr("BatchSize") @@ -107,7 +107,7 @@ class Pool_Batch(HLSCustomOp): ishape = (batch_size, *odims, k_prod * ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") @@ -116,14 +116,14 @@ 
class Pool_Batch(HLSCustomOp): folded_ishape = normal_ishape[:-1] + [fold, pe] return tuple(folded_ishape) - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ofm_ch = self.get_nodeattr("Channels") odims = self.get_nodeattr("OutImgDims") batch_size = self.get_nodeattr("BatchSize") oshape = (batch_size, *odims, ofm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) ifm_ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") @@ -147,13 +147,13 @@ class Pool_Batch(HLSCustomOp): exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size return int(exp_cycles) - def get_instream_width(self): + def get_instream_width(self, ind=0): dt_bits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = int(dt_bits * pe) return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): dt_bits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") out_width = int(dt_bits * pe) diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py index 1e6b72e4d54ede639e797f32f51fb7705ec8ce4b..a3aa9d570d0efcbe82090d19a151d4f5b12078b6 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py @@ -60,19 +60,19 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("dataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("dataType")] - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ishape = self.get_nodeattr("shape") return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): oshape = self.get_nodeattr("shape") return oshape @@ -97,7 +97,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): Please adjust PE and SIMD values so that OutWidth % InWidth = 0 or alternatively use impl_style = vivado""" - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): self.check_divisible_iowidths() iwidth = self.get_nodeattr("inWidth") ishape = self.get_normal_input_shape() @@ -117,7 +117,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): dummy_t = dummy_t.reshape(new_shape) return dummy_t.shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): self.check_divisible_iowidths() owidth = self.get_nodeattr("outWidth") oshape = self.get_normal_output_shape() @@ -142,11 +142,11 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def get_instream_width(self): + def get_instream_width(self, ind=0): in_width = self.get_nodeattr("inWidth") return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): out_width = self.get_nodeattr("outWidth") return out_width diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index a7c3cd0be59db4ba8665f8fba5be72282339b8c8..40d016de43820a37e8c7894a3e1f30146c667e59 100644 --- 
a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -68,11 +68,29 @@ class StreamingFIFO(HLSCustomOp): "auto", {"auto", "block", "distributed", "ultra"}, ), + # whether depth monitoring is enabled (impl_style=rtl only) + "depth_monitor": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) return my_attrs + def get_adjusted_depth(self): + impl = self.get_nodeattr("impl_style") + depth = self.get_nodeattr("depth") + if impl == "vivado": + old_depth = depth + # round up depth to nearest power-of-2 + # Vivado FIFO impl may fail otherwise + depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth + if old_depth != depth: + warnings.warn( + "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado" + % (self.onnx_node.name, old_depth, depth) + ) + + return depth + def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() oshape = self.get_normal_output_shape() @@ -97,6 +115,14 @@ class StreamingFIFO(HLSCustomOp): def verify_node(self): pass + def get_verilog_top_module_intf_names(self): + ret = super().get_verilog_top_module_intf_names() + is_rtl = self.get_nodeattr("impl_style") == "rtl" + is_depth_monitor = self.get_nodeattr("depth_monitor") == 1 + if is_rtl and is_depth_monitor: + ret["ap_none"] = ["maxcount"] + return ret + def get_verilog_top_module_name(self): "Return the Verilog top module name for this node." @@ -180,10 +206,8 @@ class StreamingFIFO(HLSCustomOp): self.set_nodeattr("ip_vlnv", vlnv) self.code_gen_dict.clear() - def get_normal_input_shape(self): - depth = self.get_nodeattr("depth") - # depth has to be between 2 and 256 with the current - # StreamingFIFO implementation + def get_normal_input_shape(self, ind=0): + depth = self.get_adjusted_depth() assert depth >= 2, """Depth is too low""" if depth > 256 and self.get_nodeattr("impl_style") == "rtl": warnings.warn( @@ -211,22 +235,22 @@ class StreamingFIFO(HLSCustomOp): return normal_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_instream_width(self): + def get_instream_width(self, ind=0): dtype = DataType[self.get_nodeattr("dataType")] folded_shape = self.get_nodeattr("folded_shape") in_width = folded_shape[-1] * dtype.bitwidth() return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): dtype = DataType[self.get_nodeattr("dataType")] folded_shape = self.get_nodeattr("folded_shape") in_width = folded_shape[-1] * dtype.bitwidth() @@ -328,7 +352,7 @@ class StreamingFIFO(HLSCustomOp): elif impl_style == "vivado": cmd = [] node_name = self.onnx_node.name - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() ram_style = self.get_nodeattr("ram_style") # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] @@ -393,7 +417,7 @@ class StreamingFIFO(HLSCustomOp): """Calculates resource estimation for BRAM""" impl = self.get_nodeattr("impl_style") ram_type = self.get_nodeattr("ram_style") - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() if impl == "rtl" or (impl == "vivado" and ram_type != "block"): @@ -418,7 +442,7 @@ class 
StreamingFIFO(HLSCustomOp): impl = self.get_nodeattr("impl_style") ram_type = self.get_nodeattr("ram_style") - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() if impl == "rtl" or (impl == "vivado" and ram_type != "ultra"): @@ -428,7 +452,7 @@ class StreamingFIFO(HLSCustomOp): return (math.ceil(depth / 4096)) * (math.ceil(W / 72)) def bram_efficiency_estimation(self): - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() bram16_est = self.bram_estimation() if bram16_est == 0: @@ -441,7 +465,7 @@ class StreamingFIFO(HLSCustomOp): """Calculates resource estimations for LUTs""" impl = self.get_nodeattr("impl_style") ram_type = self.get_nodeattr("ram_style") - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() address_luts = 2 * math.ceil(math.log(depth, 2)) diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py index 882b40a0aaf542e6dcaf427ca3567ae78394ede5..a0e60931edd8590aaebc0560c4bd28d61d62e8ea 100755 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py @@ -57,11 +57,11 @@ class StreamingMaxPool_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("dataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("dataType")] @@ -82,13 +82,13 @@ class StreamingMaxPool_Batch(HLSCustomOp): ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() return (ifm_dim[0] == 1) and (k[0] == 1) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") ifm_ch = self.get_nodeattr("NumChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") ifm_ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") @@ -99,7 +99,7 @@ class StreamingMaxPool_Batch(HLSCustomOp): folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") k_h, k_w = tuple(self.get_nodeattr("PoolDim")) ifm_ch = self.get_nodeattr("NumChannels") @@ -116,7 +116,7 @@ class StreamingMaxPool_Batch(HLSCustomOp): oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): # even though there is no folding in the current hlslib op, # insert a time multiplexing axis to remain compatible with the # shapes produced by the rest of the dataflow pipeline @@ -155,7 +155,7 @@ class StreamingMaxPool_Batch(HLSCustomOp): # TODO: adjust inaccurate formula return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) - def get_instream_width(self): + def get_instream_width(self, ind=0): dt_bits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") ifm_ch = self.get_nodeattr("NumChannels") @@ -165,7 +165,7 @@ class StreamingMaxPool_Batch(HLSCustomOp): in_width = int(dt_bits * ifm_ch) return in_width - def get_outstream_width(self): + def 
get_outstream_width(self, ind=0): """For streaming maxpool out stream width is the same as in stream width""" return self.get_instream_width() diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index e73fa9bb2872d4a5023afb0c4e6953b4e6866b8d..c7bbc3f139b64f57943b2b099083a9611951e9c4 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -319,6 +319,7 @@ module $TOPNAME$( ap_clk, ap_rst_n, count, +maxcount, in0_$HLS_SNAME$_TDATA, in0_$HLS_SNAME$_TVALID, in0_$HLS_SNAME$_TREADY, @@ -330,6 +331,7 @@ out_$HLS_SNAME$_TREADY input ap_clk; input ap_rst_n; output $COUNT_RANGE$ count; +output $COUNT_RANGE$ maxcount; input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; input in0_$HLS_SNAME$_TVALID; output in0_$HLS_SNAME$_TREADY; @@ -346,6 +348,7 @@ $LAYER_NAME$ .clock(ap_clk), .reset(!ap_rst_n), .count(count), + .maxcount(maxcount), .i_d(in0_$HLS_SNAME$_TDATA), .i_v(in0_$HLS_SNAME$_TVALID), .i_r(in0_$HLS_SNAME$_TREADY), diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index 5383cc1f4bdf9eb88c7d7bd69c25231282f11c6f..f2cc64668d62ef15446772309577e9b15a378ef5 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -75,9 +75,6 @@ class Thresholding_Batch(HLSCustomOp): "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), "outputDataType": ("s", True, ""), - # input and output FIFO depths - "inFIFODepth": ("i", False, 0), - "outFIFODepth": ("i", False, 0), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -185,11 +182,11 @@ class Thresholding_Batch(HLSCustomOp): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] @@ -221,11 +218,11 @@ class Thresholding_Batch(HLSCustomOp): self.set_nodeattr("weightDataType", tdt.name) return DataType[self.get_nodeattr("weightDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() return i_bits * self.get_nodeattr("PE") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() return o_bits * self.get_nodeattr("PE") @@ -251,7 +248,7 @@ class Thresholding_Batch(HLSCustomOp): weightstream = self.get_weightstream_width() return max([weightstream, temp_value]) - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") fold = ich // pe @@ -259,17 +256,17 @@ class Thresholding_Batch(HLSCustomOp): folded_input_shape = tuple(vecs + [fold, pe]) return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): # same shape as input return self.get_folded_input_shape() - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [ich]) return normal_input_shape - def get_normal_output_shape(self): + def 
get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() @@ -960,3 +957,20 @@ class Thresholding_Batch(HLSCustomOp): "Return a list of extra tcl directives for HLS synthesis." return ["config_compile -pipeline_style frp"] + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_tmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index 7386aa7e6311754b653e94f8d2e9b2a910a1370b..1bd32442a1986d6a86571e85a09322d6c15d8a78 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -218,21 +218,21 @@ class TLastMarker(HLSCustomOp): def get_number_output_values(self): return self.get_nodeattr("NumIters") - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): stream_width = self.get_nodeattr("StreamWidth") elem_width = self.get_nodeattr("ElemWidth") n_packed_elems = stream_width // elem_width n_iters = self.get_nodeattr("NumIters") return (1, n_iters, n_packed_elems) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_folded_input_shape() - def get_instream_width(self): + def get_instream_width(self, ind=0): stream_width = self.get_nodeattr("StreamWidth") return stream_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): stream_width = self.get_nodeattr("StreamWidth") return stream_width diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index eb51fe39fc6e7ec84204f9d541a0e47c333bbf43..a018fd35aac4d63b365e97464dab0fd4a5fa13f2 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -73,7 +73,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): exp_cycles = OFMDim * reps return int(exp_cycles) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): IFMDim = self.get_nodeattr("IFMDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") @@ -84,7 +84,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): ishape = (batch, IFMDim, 1, num_ch) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): OFMDim = self.get_nodeattr("OFMDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") @@ -95,11 +95,11 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): oshape = (batch, OFMDim, 1, num_ch) return oshape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) return tuple(normal_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) return tuple(normal_oshape) @@ -129,21 +129,21 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = 
DataType[self.get_nodeattr("inputDataType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. (Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() ifm_ch = self.get_nodeattr("NumChannels") return ibits * ifm_ch - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() ifm_ch = self.get_nodeattr("NumChannels") return obits * ifm_ch diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index b0c05d1ad6c74ceaaaa2c932f4add3f0076bda51..0375bdea68f6c10eda8a3c5f375bbb14bc9a2be5 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -208,7 +208,7 @@ class VectorVectorActivation(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] @@ -216,16 +216,16 @@ class VectorVectorActivation(HLSCustomOp): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() in_width = i_bits * self.get_nodeattr("PE") return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width @@ -249,7 +249,7 @@ class VectorVectorActivation(HLSCustomOp): return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") nf = ch // pe @@ -257,14 +257,14 @@ class VectorVectorActivation(HLSCustomOp): folded_output_shape = tuple([1, dim_h, dim_w, nf, pe]) return folded_output_shape - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") k_h, k_w = self.get_nodeattr("Kernel") normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ch = self.get_nodeattr("Channels") dim_h, dim_w = self.get_nodeattr("Dim") normal_output_shape = tuple([1, dim_h, dim_w, ch]) @@ -901,8 +901,11 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() ) - in_fifo_depth = self.get_nodeattr("inFIFODepth") - out_fifo_depth = self.get_nodeattr("outFIFODepth") + # TODO can we deprecate this entirely? 
this looks like legacy code + # that does not really serve a purpose - FIFO sizes are not typically + # allocated at this point; at best they are set to 2 as the default + in_fifo_depth = 2 + out_fifo_depth = 2 # insert depth pragmas only if specified if in_fifo_depth != 0: self.code_gen_dict["$PRAGMAS$"].append( @@ -1254,3 +1257,20 @@ class VectorVectorActivation(HLSCustomOp): thres_count = fm ret_dict[thres_param_type] = thres_count return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/qnn-data/build_dataflow/expected_output.npy b/src/finn/qnn-data/build_dataflow/expected_output.npy index a8d09384633791b7e3760dc8a2d1ba88a05d526d..98037351bb4ee49985a98631750f18e9b86965b1 100644 Binary files a/src/finn/qnn-data/build_dataflow/expected_output.npy and b/src/finn/qnn-data/build_dataflow/expected_output.npy differ diff --git a/src/finn/qnn-data/build_dataflow/input.npy b/src/finn/qnn-data/build_dataflow/input.npy index edd24de05a33a15ebc330cdab31f3d77d2c47196..8bece67b7daf5b7668ff5e7515f15a891146b00b 100644 Binary files a/src/finn/qnn-data/build_dataflow/input.npy and b/src/finn/qnn-data/build_dataflow/input.npy differ diff --git a/src/finn/qnn-data/testcase/residual_testcase.onnx b/src/finn/qnn-data/testcase/residual_testcase.onnx new file mode 100644 index 0000000000000000000000000000000000000000..c96e8c694e3a39cdb9e5d984e1c069ceb55b3f2a Binary files /dev/null and b/src/finn/qnn-data/testcase/residual_testcase.onnx differ diff --git a/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh new file mode 100644 index 0000000000000000000000000000000000000000..1c8b6403e8628e3647810ca5fca65ca1122eaf9d --- /dev/null +++ b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh @@ -0,0 +1,346 @@ +// (c) Copyright 2011-2013 Xilinx, Inc. All rights reserved. +// +// This file contains confidential and proprietary information +// of Xilinx, Inc. and is protected under U.S. and +// international copyright and other intellectual property +// laws. +// +// DISCLAIMER +// This disclaimer is not a license and does not grant any +// rights to the materials distributed herewith. 
Except as +// otherwise provided in a valid license issued to you by +// Xilinx, and to the maximum extent permitted by applicable +// law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +// WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +// AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +// BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +// INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +// (2) Xilinx shall not be liable (whether in contract or tort, +// including negligence, or under any other theory of +// liability) for any loss or damage of any kind or nature +// related to, arising under or in connection with these +// materials, including for any direct, or any indirect, +// special, incidental, or consequential loss or damage +// (including loss of data, profits, goodwill, or any type of +// loss or damage suffered as a result of any action brought +// by a third party) even if such damage or loss was +// reasonably foreseeable or Xilinx had been advised of the +// possibility of the same. +// +// CRITICAL APPLICATIONS +// Xilinx products are not designed or intended to be fail- +// safe, or for use in any application requiring fail-safe +// performance, such as life-support or safety devices or +// systems, Class III medical devices, nuclear facilities, +// applications related to the deployment of airbags, or any +// other applications that could lead to death, personal +// injury, or severe property or environmental damage +// (individually and collectively, "Critical +// Applications"). Customer assumes the sole risk and +// liability of any use of Xilinx products in Critical +// Applications, subject only to applicable laws and +// regulations governing limitations on product liability. +// +// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +// PART OF THIS FILE AT ALL TIMES. 
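Stepping back from this vendor header for a moment: the derive_characteristic_fxns overrides added above for MatrixVectorActivation, Thresholding_Batch and VectorVectorActivation all follow one pattern. When weights are streamed rather than baked into the IP (mem_mode decoupled or external), the rtlsim I/O dict gains a dummy weights stream before delegating to the base implementation. A condensed sketch of that shared pattern (make_chrc_io_dict and wmem_fxn are illustrative names; the real overrides inline this logic and pass the dict as override_rtlsim_dict):

    import numpy as np

    def make_chrc_io_dict(inst, wmem_fxn):
        # one dummy transaction placeholder per folded input element
        n_inps = np.prod(inst.get_folded_input_shape()[:-1])
        io_dict = {
            "inputs": {"in0": [0 for _ in range(n_inps)]},
            "outputs": {"out": []},
        }
        if inst.get_nodeattr("mem_mode") in ["decoupled", "external"]:
            # the streamed weights occupy their own AXI stream, which must
            # also be driven during characterization: wmem words (calc_wmem()
            # or calc_tmem() above) repeated once per input vector
            num_w_reps = np.prod(inst.get_nodeattr("numInputVectors"))
            io_dict["inputs"]["weights"] = [
                0 for _ in range(num_w_reps * wmem_fxn())
            ]
        return io_dict

Because the base implementation only records streams whose names contain "in"/"out", the weights stream is driven during simulation but kept out of the characteristic itself.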
+//-----------------------------------------------------------------------------
+//
+// Generic Functions used by AXIS-Interconnect and Infrastructure Modules
+//
+// Verilog-standard: Verilog 2001
+//--------------------------------------------------------------------------
+// Global Parameters:
+//
+// Functions:
+//   f_clogb2
+//   f_gcd
+//   f_lcm
+//   f_get_tdata_indx
+//   f_get_tstrb_indx
+//   f_get_tkeep_indx
+//   f_get_tlast_indx
+//   f_get_tid_indx
+//   f_get_tdest_indx
+//   f_get_tuser_indx
+//   f_payload_width
+// Tasks:
+//   t_display_tdata_error
+//--------------------------------------------------------------------------
+///////////////////////////////////////////////////////////////////////////////
+// BEGIN Global Parameters
+///////////////////////////////////////////////////////////////////////////////
+// Define Signal Set indices
+localparam G_INDX_SS_TREADY = 0;
+localparam G_INDX_SS_TDATA = 1;
+localparam G_INDX_SS_TSTRB = 2;
+localparam G_INDX_SS_TKEEP = 3;
+localparam G_INDX_SS_TLAST = 4;
+localparam G_INDX_SS_TID = 5;
+localparam G_INDX_SS_TDEST = 6;
+localparam G_INDX_SS_TUSER = 7;
+localparam G_MASK_SS_TREADY = 32'h1 << G_INDX_SS_TREADY;
+localparam G_MASK_SS_TDATA = 32'h1 << G_INDX_SS_TDATA;
+localparam G_MASK_SS_TSTRB = 32'h1 << G_INDX_SS_TSTRB;
+localparam G_MASK_SS_TKEEP = 32'h1 << G_INDX_SS_TKEEP;
+localparam G_MASK_SS_TLAST = 32'h1 << G_INDX_SS_TLAST;
+localparam G_MASK_SS_TID = 32'h1 << G_INDX_SS_TID;
+localparam G_MASK_SS_TDEST = 32'h1 << G_INDX_SS_TDEST;
+localparam G_MASK_SS_TUSER = 32'h1 << G_INDX_SS_TUSER;
+
+// Task DRC error levels
+localparam G_TASK_SEVERITY_ERR = 2;
+localparam G_TASK_SEVERITY_WARNING = 1;
+localparam G_TASK_SEVERITY_INFO = 0;
+
+///////////////////////////////////////////////////////////////////////////////
+// BEGIN Functions
+///////////////////////////////////////////////////////////////////////////////
+// ceiling logb2
+  function integer f_clogb2 (input integer size);
+    integer s;
+    begin
+      s = size;
+      s = s - 1;
+      for (f_clogb2=1; s>1; f_clogb2=f_clogb2+1)
+        s = s >> 1;
+    end
+  endfunction // clogb2
+
+  // Calculates the Greatest Common Divisor between two integers using the
+  // Euclidean algorithm.
+  function automatic integer f_gcd (
+    input integer a,
+    input integer b
+  );
+    begin : main
+      integer A, B, done, swap;
+      A = a;
+      B = b;
+      done = 0;
+      while (!done)
+      begin
+        if (A < B) begin
+          swap = A;
+          A = B;
+          B = swap;
+        end else if (B != 0) begin
+          A = A - B;
+        end else begin
+          done = 1;
+        end
+      end
+
+      f_gcd = A;
+    end
+  endfunction
+
+
+  // Calculates the least common multiple (LCM) of two integers
+  function integer f_lcm (
+    input integer a,
+    input integer b
+  );
+    begin : main
+      f_lcm = (a / f_gcd(a, b)) * b;
+    end
+  endfunction
+
+  // Returns back the index to the TDATA portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tdata_indx (
+    input integer DAW, // TDATA Width
+    input integer IDW, // TID Width
+    input integer DEW, // TDEST Width
+    input integer USW, // TUSER Width
+    input [31:0] SST // Signal Set
+  );
+    begin : main
+      f_get_tdata_indx = 0;
+    end
+  endfunction
+
+  // Returns back the index to the tstrb portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+ function integer f_get_tstrb_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tdata_indx(DAW, IDW, DEW, USW, SST); + // If TDATA exists, then add its width to its base to get the tstrb index + f_get_tstrb_indx = SST[G_INDX_SS_TDATA] ? cur_indx + DAW : cur_indx; + end + endfunction + + // Returns back the index to the tkeep portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tkeep_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tstrb_indx(DAW, IDW, DEW, USW, SST); + f_get_tkeep_indx = SST[G_INDX_SS_TSTRB] ? cur_indx + DAW/8 : cur_indx; + end + endfunction + + // Returns back the index to the tlast portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tlast_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tkeep_indx(DAW, IDW, DEW, USW, SST); + f_get_tlast_indx = SST[G_INDX_SS_TKEEP] ? cur_indx + DAW/8 : cur_indx; + end + endfunction + + // Returns back the index to the tid portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tid_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tlast_indx(DAW, IDW, DEW, USW, SST); + f_get_tid_indx = SST[G_INDX_SS_TLAST] ? cur_indx + 1 : cur_indx; + end + endfunction + + // Returns back the index to the tdest portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tdest_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tid_indx(DAW, IDW, DEW, USW, SST); + f_get_tdest_indx = SST[G_INDX_SS_TID] ? cur_indx + IDW : cur_indx; + end + endfunction + + // Returns back the index to the tuser portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tuser_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tdest_indx(DAW, IDW, DEW, USW, SST); + f_get_tuser_indx = SST[G_INDX_SS_TDEST] ? cur_indx + DEW : cur_indx; + end + endfunction + + // Payload is the sum of all the AXIS signals present except for + // TREADY/TVALID + function integer f_payload_width ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tuser_indx(DAW, IDW, DEW, USW, SST); + f_payload_width = SST[G_INDX_SS_TUSER] ? 
cur_indx + USW : cur_indx; + // Ensure that the return value is never less than 1 + f_payload_width = (f_payload_width < 1) ? 1 : f_payload_width; + end + endfunction + + task t_check_tdata_width( + input integer data_width, + input [8*80-1:0] var_name, + input [8*80-1:0] inst_name, + input integer severity_lvl, + output integer ret_val + ); + // Severity levels: + // 0 = INFO + // 1 = WARNING + // 2 = ERROR + begin : t_check_tdata_width + if (data_width%8 != 0) begin + // 000 1 2 3 4 5 6 7 8 + // 012 0 0 0 0 0 0 0 0 + if (severity_lvl >= 2) begin + $display("ERROR: %m::%s", inst_name); + end else if (severity_lvl == 1) begin + $display("WARNING: %m::%s", inst_name); + end else begin + $display("INFO: %m::%s", inst_name); + end + $display(" Parameter %s (%2d) must be a multiple of 8.", var_name, data_width); + $display(" AXI4-Stream data width is only defined for byte multiples. See the "); + $display(" AMBA4 AXI4-Stream Protocol Specification v1.0 Section 2.1 for more"); + $display(" information."); + ret_val = 1; + end else begin + ret_val = 0; + end + end + endtask + + task t_check_tuser_width( + input integer tuser_width, + input [8*80-1:0] tuser_name, + input integer tdata_width, + input [8*80-1:0] tdata_name, + input [8*80-1:0] inst_name, + input integer severity_lvl, + output integer ret_val + ); + // Severity levels: + // 0 = INFO + // 1 = WARNING + // 2 = ERROR + begin : t_check_tuser_width + integer tdata_bytes; + tdata_bytes = tdata_width/8; + if ((tuser_width%tdata_bytes) != 0) begin + // 000 1 2 3 4 5 6 7 8 + // 012 0 0 0 0 0 0 0 0 + if (severity_lvl >= 2) begin + $display("ERROR: %m::%s", inst_name); + end else if (severity_lvl == 1) begin + $display("WARNING: %m::%s", inst_name); + end else begin + $display("INFO: %m::%s", inst_name); + end + $display(" Parameter %s == %2d is not the recommended value of 'an integer ", tuser_name, tuser_width); + $display(" multiple of the width of the interface (%s == %2d) in bytes.' AXI4-Stream", tdata_name, tdata_width); + $display(" TUSER width in this module is only defined when the TUSER is the"); + $display(" recommended value. See the AMBA4 AXI4-Stream Protocol Specification v1.0"); + $display(" Section 2.1, 2.3.3 and 2.8 for more information. 
"); + ret_val = 1; + end else begin + ret_val = 0; + end + end + endtask diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index b7db49eb22e0ccb6e3ffbf8ccad44d4274cb2154..7e4ab34af79c52a08e737f57b2fc8f017940bcf5 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1282,6 +1282,7 @@ class InferDuplicateStreamsLayer(Transformation): inputDataType=dt.name, numInputVectors=vecs, NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, name="DuplicateStreams_Batch_" + node.name, ) @@ -1709,6 +1710,7 @@ class InferConcatLayer(Transformation): ElemsPerStream=elems_per_stream, inputDataType=dt0.name, numInputVectors=inp_vec, + inFIFODepths=[2] * len(node.input), ) graph.node.insert(node_ind, new_node) # remove old node diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 00e2cc3bb48bcb8b81ba4750382178a4e508bec6..52e4e88b409766f0764d3ce7666dbf1971713575 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -404,6 +404,7 @@ class CreateStitchedIP(Transformation): wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name) tcl.append("add_files -norecurse %s" % wrapper_filename) model.set_metadata_prop("wrapper_filename", wrapper_filename) + tcl.append("set_property top finn_design_wrapper [current_fileset]") # synthesize to DCP and export stub, DCP and constraints if self.vitis: tcl.append( @@ -582,6 +583,10 @@ class CreateStitchedIP(Transformation): if os.path.isfile(wrapper_filename_alt): model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) else: - raise Exception("CreateStitchedIP failed, no wrapper HDL found.") + raise Exception( + """CreateStitchedIP failed, no wrapper HDL found under %s or %s. + Please check logs under the parent directory.""" + % (wrapper_filename, wrapper_filename_alt) + ) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py new file mode 100644 index 0000000000000000000000000000000000000000..822679721036c7832241db4642911ff804fb9dff --- /dev/null +++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py @@ -0,0 +1,190 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import qonnx.custom_op.registry as registry +import warnings +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.base import NodeLocalTransformation + +from finn.util.fpgadataflow import is_fpgadataflow_node + + +class DeriveCharacteristic(NodeLocalTransformation): + """For each node in the graph, run rtlsim to obtain the i/o + characteristic function for FIFO sizing and set the attribute. + It is assumed that the PrepareRTLSim transformation was already + called on the graph. + + This transformation performs rtlsim for each node, so it will run for + some time (minutes to hours depending on configuration). + + * period (int) desired period over which the characteristic function + will be derived. + + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. + """ + + def __init__(self, period, num_workers=None, manual_bypass=False): + super().__init__(num_workers=num_workers) + self.period = period + self.manual_bypass = manual_bypass + + def applyNodeLocal(self, node): + op_type = node.op_type + if is_fpgadataflow_node(node) is True: + try: + # lookup op_type in registry of CustomOps + inst = registry.getCustomOp(node) + inst.derive_characteristic_fxns(period=self.period) + except KeyError: + # exception if op_type is not supported + raise Exception( + "Custom op_type %s is currently not supported." 
% op_type + ) + return (node, False) + + def apply(self, model: ModelWrapper): + (model, run_again) = super().apply(model) + if not self.manual_bypass: + return (model, run_again) + # apply manual fix for DuplicateStreams and AddStreams for + # simple residual reconvergent paths with bypass + addstrm_nodes = model.get_nodes_by_op_type("AddStreams_Batch") + for addstrm_node in addstrm_nodes: + # we currently only support the case where one branch is + # a bypass + b0 = model.find_producer(addstrm_node.input[0]) + b1 = model.find_producer(addstrm_node.input[1]) + if (b0 is None) or (b1 is None): + warnings.warn("Found unsupported AddStreams, skipping") + return (model, run_again) + b0_is_bypass = b0.op_type == "DuplicateStreams_Batch" + b1_is_bypass = b1.op_type == "DuplicateStreams_Batch" + if (not b0_is_bypass) and (not b1_is_bypass): + warnings.warn("Found unsupported AddStreams, skipping") + return (model, run_again) + ds_node = b0 if b0_is_bypass else b1 + comp_branch_last = b1 if b0_is_bypass else b0 + + ds_comp_bout = ds_node.output[0] if b0_is_bypass else ds_node.output[1] + comp_branch_first = model.find_consumer(ds_comp_bout) + if comp_branch_first is None or comp_branch_last is None: + warnings.warn("Found unsupported DuplicateStreams, skipping") + return (model, run_again) + comp_branch_last = registry.getCustomOp(comp_branch_last) + comp_branch_first = registry.getCustomOp(comp_branch_first) + # for DuplicateStreams, use comp_branch_first's input characterization + # for AddStreams, use comp_branch_last's output characterization + period = comp_branch_first.get_nodeattr("io_chrc_period") + comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[ + : 2 * period + ] + comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[ + 2 * period : + ] + ds_node_inst = registry.getCustomOp(ds_node) + addstrm_node_inst = registry.getCustomOp(addstrm_node) + ds_node_inst.set_nodeattr("io_chrc_period", period) + ds_node_inst.set_nodeattr("io_characteristic", comp_branch_first_f * 2) + addstrm_node_inst.set_nodeattr("io_chrc_period", period) + addstrm_node_inst.set_nodeattr("io_characteristic", comp_branch_last_f * 2) + warnings.warn( + f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}" + ) + warnings.warn( + f"Set {addstrm_node.name} chrc. from {comp_branch_last.onnx_node.name}" + ) + return (model, run_again) + + +class DeriveFIFOSizes(NodeLocalTransformation): + """Prerequisite: DeriveCharacteristic already called on graph. + For each node in the graph, use the accumulated I/O characteristic function + to perform FIFO sizing, setting the in/outFIFODepth attributes of HLSCustomOp + nodes. + + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. 
+ """ + + def __init__(self, num_workers=None): + super().__init__(num_workers=num_workers) + + def applyNodeLocal(self, node): + op_type = node.op_type + if is_fpgadataflow_node(node) is True: + try: + # lookup op_type in registry of CustomOps + prod = registry.getCustomOp(node) + assert op_type != "StreamingFIFO", "Found existing FIFOs" + period = prod.get_nodeattr("io_chrc_period") + prod_chrc = prod.get_nodeattr("io_chrc_out")[0] + assert ( + len(prod_chrc) == 2 * period + ), "Found unexpected characterization attribute" + if any([x > 2 for x in prod.get_nodeattr("outFIFODepths")]): + # FIFO depth already set, can skip this node + return (node, False) + + # find consumers + model = self.ref_input_model + out_fifo_depths = [] + for output_name in node.output: + cons_node = model.find_consumer(output_name) + if cons_node is None: + # could be final node, will be overridden if so + # need an entry in the list anyway + out_fifo_depths.append(2) + continue + cons = registry.getCustomOp(cons_node) + cons_chrc = cons.get_nodeattr("io_chrc_in")[0] + # find minimum phase shift satisfying the constraint + pshift_min = period - 1 + for pshift_cand in range(period): + prod_chrc_part = prod_chrc[pshift_cand:period] + cons_chrc_part = cons_chrc[: period - pshift_cand] + if (prod_chrc_part >= cons_chrc_part).all(): + pshift_min = pshift_cand + break + prod_chrc_part = prod_chrc[pshift_min : (pshift_min + period)] + cons_chrc_part = cons_chrc[:period] + fifo_depth = int((prod_chrc_part - cons_chrc_part).max()) + out_fifo_depths.append(fifo_depth) + # set output FIFO depth for this (producing) node + # InsertFIFO looks at the max of (outFIFODepth, inFIFODepth) + # for each tensor + prod.set_nodeattr("outFIFODepths", out_fifo_depths) + + except KeyError: + # exception if op_type is not supported + raise Exception( + "Custom op_type %s is currently not supported." % op_type + ) + return (node, False) diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 9817f2e3d2857bd5e59b304fbdaf3bad74a9b037..efc179923545eb06e4d173c683b0941887f8bb79 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -81,6 +81,12 @@ class InsertDWC(Transformation): dwc_in_width = n0.get_outstream_width() # determine dwc outwidth dwc_out_width = n1.get_instream_width() + larger_width = max(dwc_in_width, dwc_out_width) + smaller_width = min(dwc_in_width, dwc_out_width) + if larger_width % smaller_width == 0: + impl_style = "hls" + else: + impl_style = "vivado" # determine shape for dwc dwc_shape = n0.get_normal_output_shape() @@ -105,6 +111,7 @@ class InsertDWC(Transformation): inWidth=dwc_in_width, outWidth=dwc_out_width, dataType=str(dtype.name), + impl_style=impl_style, ) # insert dwc graph.node.insert(node_ind + 1, dwc_node) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 78200b280960ad53e3e84d44394c10296c432ba5..79bd717a5d96e7a9839740d73254db53e5133e13 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -70,16 +70,26 @@ class InsertFIFO(Transformation): node attribute 'outFIFODepth' of the previous and node attribute 'inFIFODepth' of the subsequent node. max() of these two values sets the FIFO depth. - Normally, shallow-depth (<=2) FIFOs won't be created since HLS streaming - interfaces already have a degree of buffering. 
You can set - create_shallow_fifos=True to override this default behavior. + Constructor arguments: + - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of + Verilog FIFOs (Q_srl.v) + - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for + large FIFOs implemented by Vivado + - create_shallow_fifos : Normally, shallow-depth (<=2) FIFOs won't be created since + HLS streaming interfaces already have a degree of buffering. + Override with this parameter. + The other node attributes necessary to create a FIFO node are taken from the node the FIFO node is inserted after: 'folded_shape' and 'dtype'""" - def __init__(self, create_shallow_fifos=False): + def __init__( + self, create_shallow_fifos=False, max_qsrl_depth=None, vivado_ram_style="auto" + ): super().__init__() self.create_shallow_fifos = create_shallow_fifos + self.max_qsrl_depth = max_qsrl_depth + self.vivado_ram_style = vivado_ram_style def apply(self, model): graph = model.graph @@ -88,8 +98,8 @@ class InsertFIFO(Transformation): for first_node in graph.node: node_ind += 1 if _suitable_node(first_node): - for n_output in first_node.output: - consumers = model.find_consumers(n_output) + for idx_out, output_name in enumerate(first_node.output): + consumers = model.find_consumers(output_name) if consumers == []: continue if len(consumers) > 1: @@ -108,11 +118,9 @@ class InsertFIFO(Transformation): # input of the second node is equal n1 = getCustomOp(consumer) for idx, inp in enumerate(consumer.input): - if inp == n_output: - if idx == 0: - fld_shape_2 = n1.get_folded_input_shape() - else: - fld_shape_2 = n1.get_folded_input_shape(ind=idx) + if inp == output_name: + fld_shape_2 = n1.get_folded_input_shape(ind=idx) + idx_inp = idx assert _suitable_folded_shapes( fld_shape, fld_shape_2 ), """The @@ -122,12 +130,10 @@ class InsertFIFO(Transformation): # check if outFIFOdepth attribute of first node # and inFIFOdepth attribute of consumer node is equal - n0_depth = n0.get_nodeattr("outFIFODepth") - n1_depth = n1.get_nodeattr("inFIFODepth") - if n0_depth == n1_depth: - fifo_depth = n0_depth - elif n0_depth != n1_depth: - fifo_depth = max(n0_depth, n1_depth) + n0_depth = n0.get_nodeattr("outFIFODepths")[idx_out] + n1_depth = n1.get_nodeattr("inFIFODepths")[idx_inp] + + fifo_depth = max(n0_depth, n1_depth) if fifo_depth > 2 or self.create_shallow_fifos: # assumption: HLS streaming components already have @@ -143,25 +149,40 @@ class InsertFIFO(Transformation): graph.value_info.append(fifo_output_tensor) model.set_tensor_datatype(fifo_output_tensor.name, dtype) + if ( + self.max_qsrl_depth is None + or fifo_depth <= self.max_qsrl_depth + ): + impl_style = "rtl" + else: + impl_style = "vivado" + fifo_node = oh.make_node( "StreamingFIFO", - [n_output], + [output_name], [fifo_output_tensor.name], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, dataType=str(dtype.name), + impl_style=impl_style, + ram_style=self.vivado_ram_style, ) # insert fifo graph.node.insert(node_ind + 1, fifo_node) # set fifo output tensor as new input tensor of second node for idx, inp in enumerate(consumer.input): - if inp == n_output: + if inp == output_name: consumer.input[idx] = fifo_output_tensor.name # ensure created FIFO depth is reflected on both sides - n0.set_nodeattr("outFIFODepth", fifo_depth) - n1.set_nodeattr("inFIFODepth", fifo_depth) + odepths = n0.get_nodeattr("outFIFODepths") + odepths[idx_out] = fifo_depth + n0.set_nodeattr("outFIFODepths", odepths) + idepths = 
n1.get_nodeattr("inFIFODepths") + idepths[idx_inp] = fifo_depth + n1.set_nodeattr("inFIFODepths", idepths) + graph_modified = True if graph_modified is False: @@ -177,13 +198,9 @@ class InsertFIFO(Transformation): n_input = first_node.input[inp_ind] n0 = getCustomOp(first_node) # determine fifo node attributes - if inp_ind == 0: - fld_shape = n0.get_folded_input_shape() - dtype = n0.get_input_datatype() - else: - fld_shape = n0.get_folded_input_shape(inp_ind) - dtype = n0.get_input_datatype(inp_ind) - fifo_depth = n0.get_nodeattr("inFIFODepth") + fld_shape = n0.get_folded_input_shape(inp_ind) + dtype = n0.get_input_datatype(inp_ind) + fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] if fifo_depth <= 2: warnings.warn("Overriding input FIFO depth to 32") @@ -198,6 +215,11 @@ class InsertFIFO(Transformation): graph.value_info.append(fifo_output_tensor) model.set_tensor_datatype(fifo_output_tensor.name, dtype) + if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth: + impl_style = "rtl" + else: + impl_style = "vivado" + fifo_node = oh.make_node( "StreamingFIFO", [n_input], @@ -207,6 +229,8 @@ class InsertFIFO(Transformation): depth=fifo_depth, folded_shape=fld_shape, dataType=str(dtype.name), + impl_style=impl_style, + ram_style=self.vivado_ram_style, ) # insert fifo graph.node.insert(0, fifo_node) @@ -227,10 +251,11 @@ class InsertFIFO(Transformation): ), """Insert tlast marker should be done after inserting the FIFOs""" n0 = getCustomOp(final_node) + out_ind = list(final_node.output).index(graph_out_name) # determine fifo node attributes - fld_shape = n0.get_folded_output_shape() - dtype = n0.get_output_datatype() - fifo_depth = n0.get_nodeattr("outFIFODepth") + fld_shape = n0.get_folded_output_shape(out_ind) + dtype = n0.get_output_datatype(out_ind) + fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] if fifo_depth <= 2: warnings.warn("Overriding output FIFO depth to 32") @@ -245,6 +270,11 @@ class InsertFIFO(Transformation): graph.value_info.append(fifo_input_tensor) model.set_tensor_datatype(fifo_input_tensor.name, dtype) + if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth: + impl_style = "rtl" + else: + impl_style = "vivado" + fifo_node = oh.make_node( "StreamingFIFO", [fifo_input_tensor.name], @@ -254,6 +284,8 @@ class InsertFIFO(Transformation): depth=fifo_depth, folded_shape=fld_shape, dataType=str(dtype.name), + impl_style=impl_style, + ram_style=self.vivado_ram_style, ) # insert fifo graph.node.append(fifo_node) diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 4b4eb6362faf641def057afadfa7b5e019f54698..28bcd9598af34072cc854fdf23778bef778bd985 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -211,7 +211,8 @@ class InsertIODMA(Transformation): # attached IODMA fc_extw_nodes = list( filter( - lambda x: x.op_type == "MatrixVectorActivation" + lambda x: x.op_type + in ["MatrixVectorActivation", "VectorVectorActivation"] and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, @@ -259,6 +260,10 @@ class InsertIODMA(Transformation): ) fc_node.input[1] = fc_node_in.name model.graph.node.insert(0, dma_node) + # expand inFIFODepths for new second input of node + infifo_depth = fc_inst.get_nodeattr("inFIFODepths") + infifo_depth.append(8) + fc_inst.set_nodeattr("inFIFODepths", infifo_depth) modified = True if modified: model = 
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 0139c71666fdfa4b60cb356ceb65ce2c5b831c13..f715aaeffb6d4d00f2e14c5fb25ec931443d5d97 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -192,10 +192,11 @@ class InsertAndSetFIFODepths(Transformation):
    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
                       Verilog FIFOs (Q_srl.v)
    - max_depth : how deep the "max"-sized FIFOs initially inserted will be
+                 if set to None, use the tensor size as the depth
    - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs
                      smaller where appropriate
    - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
-                        large FIFOs implemented by Vivado
+                        large FIFOs implemented by Vivado afterwards

    Assumed input graph properties:
    - all nodes are fpgadataflow nodes
@@ -210,7 +211,7 @@ class InsertAndSetFIFODepths(Transformation):
    necessary to insert FIFOs between them to prevent stalls due to bursty
    behavior. The sizes of those FIFOs are hard to predict analytically, so
    we do the following:
-    - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes
+    - insert deep (=tensor size) FIFOs between all fpgadataflow nodes
    - create stitched design
    - run through rtlsim with stream of multiple random input images (to fill pipeline)
    - keep track of observed maximum occupancy for each FIFO during rtlsim
@@ -223,7 +224,7 @@ class InsertAndSetFIFODepths(Transformation):
         fpgapart,
         clk_ns=10.0,
         max_qsrl_depth=256,
-        max_depth=2**14,
+        max_depth=None,
         swg_exception=True,
         vivado_ram_style="auto",
     ):
@@ -236,6 +237,9 @@ class InsertAndSetFIFODepths(Transformation):
         self.vivado_ram_style = vivado_ram_style

     def apply(self, model):
+        # these optypes may potentially use external weights
+        # we'll temporarily change them to use decoupled mode for FIFO sizing
+        extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"]
         # change external to decoupled and warn user
         # this way we are sure we have exactly one input/output
         modified_fc_nodes = []
@@ -246,9 +250,22 @@ class InsertAndSetFIFODepths(Transformation):
             )
             assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
             node = getCustomOp(node)
-            node.set_nodeattr("inFIFODepth", self.max_depth)
-            node.set_nodeattr("outFIFODepth", self.max_depth)
-            if node.onnx_node.op_type == "MatrixVectorActivation":
+            ifd = node.get_nodeattr("inFIFODepths")
+            ofd = node.get_nodeattr("outFIFODepths")
+            if self.max_depth is not None:
+                ifd = [self.max_depth] * len(ifd)
+                ofd = [self.max_depth] * len(ofd)
+            else:
+                # set each FIFO to its tensor size
+                # (except stream width hence the :-1)
+                for i in range(len(ifd)):
+                    ifd[i] = np.prod(node.get_folded_input_shape(i)[:-1])
+                for o in range(len(ofd)):
+                    ofd[o] = np.prod(node.get_folded_output_shape(o)[:-1])
+            node.set_nodeattr("inFIFODepths", ifd)
+            node.set_nodeattr("outFIFODepths", ofd)
+
+            if node.onnx_node.op_type in extw_optypes:
                 mmode = node.get_nodeattr("mem_mode")
                 if mmode == "external":
                     modified_fc_nodes.append(node.onnx_node.name)
@@ -267,13 +284,17 @@ class InsertAndSetFIFODepths(Transformation):

         # gather FIFO names, check they are of expected depth
         fifos = {}
-        for node in model.graph.node:
-            if node.op_type == "StreamingFIFO":
-                fifos[node.name] = 0
-                node = getCustomOp(node)
-                # check depths and fix as necessary
-                if node.get_nodeattr("depth") != self.max_depth:
-                    node.set_nodeattr("depth", self.max_depth)
+        fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")
+        for node in fifo_nodes:
+            fifos[node.name] = 0
+            node = getCustomOp(node)
+            node.set_nodeattr("depth_monitor", 1)
+            node.set_nodeattr("impl_style", "rtl")
+            # check depths and fix as necessary
+            if (self.max_depth is not None) and (
+                node.get_nodeattr("depth") != self.max_depth
+            ):
+                node.set_nodeattr("depth", self.max_depth)

         # insert FIFOs and do all transformations for RTLsim
         model = model.transform(AnnotateCycles())
@@ -324,21 +345,6 @@ class InsertAndSetFIFODepths(Transformation):
             else:
                 set_signal(sim, "tvalid", 0)

-            # check/update all fifo counts
-            for key in fifos:
-                current_state = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["state"]
-                current_addr = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["addr"]
-                if current_state == 2:
-                    current_count = current_addr + 2
-                else:
-                    current_count = current_state
-                if current_count > fifos[key]:
-                    fifos[key] = current_count
-
             # since latency estimation is very pessimistic, detect first output
             # and fast-forward the sim
             if get_signal(sim, "tvalid") != 0 and not output_detected:
@@ -352,6 +358,12 @@ class InsertAndSetFIFODepths(Transformation):
                 "No output detected, calculated FIFO depths may not be correct"
             )

+        for ind, node in enumerate(fifo_nodes):
+            maxcount_name = "maxcount_%d" % ind
+            if ind == 0:
+                maxcount_name = "maxcount"
+            fifos[node.name] = sim[maxcount_name]
+
         # Apply depths back into the model;
         # also set in/outFIFODepth to zero for non-FIFO
         # nodes, preventing further FIFO insertion
@@ -364,6 +376,7 @@ class InsertAndSetFIFODepths(Transformation):
                 depth = optimize_depth(fifos[node.name])
                 node_inst = getCustomOp(node)
                 node_inst.set_nodeattr("depth", depth)
+                node_inst.set_nodeattr("depth_monitor", 0)
                 # Set FIFO implementation/ram styles
                 if depth > self.max_qsrl_depth:
                     node_inst.set_nodeattr("impl_style", "vivado")
@@ -374,11 +387,14 @@ class InsertAndSetFIFODepths(Transformation):
                 reset_implementation(node_inst)
                 del fifos[node.name]
             else:
-                getCustomOp(node).set_nodeattr("inFIFODepth", 0)
-                getCustomOp(node).set_nodeattr("outFIFODepth", 0)
-                # for every FC node we changed from external to decoupled,
+                inst = getCustomOp(node)
+                ifd = inst.get_nodeattr("inFIFODepths")
+                ofd = inst.get_nodeattr("outFIFODepths")
+                inst.set_nodeattr("inFIFODepths", [0] * len(ifd))
+                inst.set_nodeattr("outFIFODepths", [0] * len(ofd))
+                # for every extw node we changed from external to decoupled,
                 # change back and reset implementation
-                if node.op_type == "MatrixVectorActivation":
+                if node.op_type in extw_optypes:
                     if node.name in modified_fc_nodes:
                         node_inst = getCustomOp(node)
                         node_inst.set_nodeattr("mem_mode", "external")
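After the simulation loop, the observed occupancies are read back in one pass from the maxcount outputs wired up above (plain maxcount for the first FIFO, maxcount_1, maxcount_2, ... for the rest). A sketch of that readout in isolation, assuming a PyVerilator sim handle:

    def read_observed_depths(sim, fifo_nodes):
        """Map each StreamingFIFO node name to its maximum observed occupancy."""
        observed = {}
        for ind, node in enumerate(fifo_nodes):
            signal = "maxcount" if ind == 0 else "maxcount_%d" % ind
            observed[node.name] = sim[signal]
        return observed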
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
index f6a51da8e44ea60ae5693cdd033b39bdf51376ac..d7ed3e261fe024b7f054382f12184628d3f3e94c 100644
--- a/src/finn/util/pyverilator.py
+++ b/src/finn/util/pyverilator.py
@@ -26,7 +26,10 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+import pkg_resources as pk
+
 import os
+import shutil

 from pyverilator import PyVerilator

 from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
@@ -74,14 +77,35 @@ def pyverilate_stitched_ip(
     # are identical but in multiple directories (regslice_core.v)

     # remove duplicates from list by doing list -> set -> list
-    all_verilog_files = list(
-        set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), all_verilog_srcs))
+    src_exts = [".v", ".sv"]
+
+    all_verilog_src_files = list(
+        set(
+            filter(
+                lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs
+            )
+        )
+    )
+
+    verilog_header_dir = make_build_dir("pyverilator_vh_")
+    # use custom version of axis infrastructure vh
+    # to enable Verilator to simulate AMD/Xilinx components (e.g. DWC)
+    custom_vh = pk.resource_filename(
+        "finn.qnn-data", "verilog/custom_axis_infrastructure.vh"
     )
+    shutil.copy(custom_vh, verilog_header_dir + "/axis_infrastructure_v1_1_0.vh")
+    for fn in all_verilog_srcs:
+        if fn.endswith(".vh"):
+            if "axis_infrastructure_v1_1_0.vh" in fn:
+                # skip, we use a custom version for this file without recursive gcd
+                continue
+            else:
+                shutil.copy(fn, verilog_header_dir)

     # remove all but one instances of regslice_core.v
     filtered_verilog_files = []
     remove_entry = False
-    for vfile in all_verilog_files:
+    for vfile in all_verilog_src_files:
         if "regslice_core" in vfile:
             if not remove_entry:
                 filtered_verilog_files.append(vfile)
@@ -94,7 +118,12 @@ def pyverilate_stitched_ip(
     for vfile in filtered_verilog_files:
         with open(vfile) as rf:
             wf.write("//Added from " + vfile + "\n\n")
-            wf.write(rf.read())
+            lines = rf.read()
+            for line in lines.split("\n"):
+                # break down too-long lines, Verilator complains otherwise
+                if len(line) > 20000:
+                    line = line.replace("&", "\n&")
+                wf.write("\n" + line)

     verilator_args = []
     # disable common verilator warnings that should be harmless but commonly occur
@@ -108,10 +137,20 @@ def pyverilate_stitched_ip(
     # force inlining of all submodules to ensure we can read internal signals properly
     if read_internal_signals:
         verilator_args += ["--inline-mult", "0"]
+    # add defines to make certain XPM src files work with Verilator
+    verilator_args.append("-DDISABLE_XPM_ASSERTIONS")
+    verilator_args.append("-DOBSOLETE")
+    verilator_args.append("-DONESPIN")
+    verilator_args.append("--bbox-unsup")
+    vivado_path = os.environ["VIVADO_PATH"]
+    # additional SystemVerilog modules to make XPMs work with Verilator
+    xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv"
+    xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv"
+    xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv"

     sim = PyVerilator.build(
-        top_module_file_name,
-        verilog_path=[vivado_stitch_proj_dir],
+        [top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc],
+        verilog_path=[vivado_stitch_proj_dir, verilog_header_dir],
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
         top_module_name=top_module_name,
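The XPM models come from the Vivado installation pointed to by VIVADO_PATH; a defensive variant (a sketch under that assumption, not part of the patch) can verify they exist before invoking PyVerilator.build:

    import os

    def xpm_sources(vivado_path):
        """Return the XPM SystemVerilog models shipped with a Vivado install."""
        names = ["xpm_memory", "xpm_cdc", "xpm_fifo"]
        paths = [f"{vivado_path}/data/ip/xpm/{n}/hdl/{n}.sv" for n in names]
        missing = [p for p in paths if not os.path.isfile(p)]
        assert missing == [], f"XPM sources not found: {missing}"
        return paths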
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index f5d3b1c30b8b7b439eae1c684ad84b33a3401c7c..bfe4aa0bb826c73f6a7c67f025e24764da8c36cc 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -180,6 +180,7 @@ def execute_parent(parent_path, child_path, input_tensor_npy, return_full_ctx=Fa
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
     sdp_node.set_nodeattr("model", child_path)
+    sdp_node.set_nodeattr("return_full_exec_context", 1 if return_full_ctx else 0)
     ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
     if return_full_ctx:
         return ret
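With the flag forwarded to the StreamingDataflowPartition node, callers now get the child model's intermediate tensors too. A usage sketch with placeholder paths and input:

    import numpy as np
    from finn.util.test import execute_parent

    x = np.zeros((1, 784), dtype=np.float32)  # placeholder input
    # returns the full execution context, including child-model intermediates
    full_ctx = execute_parent("parent.onnx", "child.onnx", x, return_full_ctx=True)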
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 103f18b514c23c4e1ad35a85d020dc0481aa9c47..5f787d1f889645d04884aed9b89a0b1c91d1f418 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -569,8 +569,8 @@ class TestEnd2End:
         for node in hls_layers:
             if node.op_type != "StreamingFIFO":
                 op_inst = getCustomOp(node)
-                assert op_inst.get_nodeattr("inFIFODepth") == 0
-                assert op_inst.get_nodeattr("outFIFODepth") == 0
+                assert op_inst.get_nodeattr("inFIFODepths") == [0]
+                assert op_inst.get_nodeattr("outFIFODepths") == [0]
         model.save(
             get_checkpoint_name(
                 topology, wbits, abits, QONNX_export, "fifodepth_" + kind
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fd1439bd055782692bac404622137e166ef5e07
--- /dev/null
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2022 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import pytest
+
+import json
+import shutil
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+from finn.util.basic import make_build_dir
+from finn.util.test import get_trained_network_and_ishape
+
+
+def fetch_test_model(topology, wbits=2, abits=2):
+    tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology)
+    (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
+    chkpt_name = tmp_output_dir + "/model.onnx"
+    BrevitasONNXManager.export(model, ishape, chkpt_name)
+    return tmp_output_dir
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fifosizing_linear():
+    tmp_output_dir = fetch_test_model("tfc")
+    cfg = build_cfg.DataflowBuildConfig(
+        output_dir=tmp_output_dir,
+        auto_fifo_depths=True,
+        auto_fifo_strategy="characterize",
+        target_fps=10000,
+        synth_clk_period_ns=10.0,
+        board="Pynq-Z1",
+        rtlsim_batch_size=100,
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.STITCHED_IP,
+            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        ],
+        default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED,
+    )
+    build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg)
+    with open(tmp_output_dir + "/report/estimate_network_performance.json") as f:
+        est_data = json.load(f)
+    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
+        sim_data = json.load(f)
+    assert (
+        float(sim_data["throughput[images/s]"])
+        / float(est_data["estimated_throughput_fps"])
+        > 0.9
+    )
+    shutil.rmtree(tmp_output_dir)
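The closing assertion of the new test compares rtlsim throughput against the analytical estimate; the same check as a reusable helper (illustrative names, same report layout as above):

    import json

    def throughput_ratio(report_dir):
        """rtlsim throughput divided by the analytical estimate (1.0 = ideal)."""
        with open(report_dir + "/estimate_network_performance.json") as f:
            est = json.load(f)
        with open(report_dir + "/rtlsim_performance.json") as f:
            sim = json.load(f)
        return float(sim["throughput[images/s]"]) / float(
            est["estimated_throughput_fps"]
        )

    # e.g. assert throughput_ratio(tmp_output_dir + "/report") > 0.9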
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 2e2da0da7a217091d76d0a59a2a36a8e6a28af8e..34928ce45be0fd96d27b153ae28e2128bf306bb5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -53,12 +53,11 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10


-def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style):
+def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
     pad_h = padding[0] + padding[2]
     pad_w = padding[1] + padding[3]
     idim_h, idim_w = idim
-    assert pad_style == 2, "only pad_style == 2 supported in hlslib"
     assert pad_h > 0 or pad_w > 0, "Output dim should be greater than input dim"
     odim_h = idim_h + pad_h
     odim_w = idim_w + pad_w
@@ -80,7 +79,6 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
             Padding=padding,
             NumChannels=num_ch,
             inputDataType=str(idt.name),
-            PaddingStyle=pad_style,
             numInputVectors=1,
             SIMD=simd,
         )
@@ -101,13 +99,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty

 # input image dimension
 @pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
 # number of rows and number of cols to add
-@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3]])
+@pytest.mark.parametrize(
+    "pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3], [7, 0, 8, 0]]
+)
 # number of channels
 @pytest.mark.parametrize("num_ch", [2, 4])
 # Input parallelism
 @pytest.mark.parametrize("simd", [1, 2])
-# PaddingStyle: selects behavior when (odim-idim)%2 != 0
-@pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
 @pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]])
 # execution mode
@@ -115,7 +113,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
     if num_ch % simd != 0:
         pytest.skip(" num_ch % simd != 0, skipping")

@@ -123,19 +121,13 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
     pad_h = pad[0] + pad[2]
     pad_w = pad[1] + pad[3]

-    if idim_h == idim_w and pad_h != pad_w:
-        pytest.skip(
-            """Only equal padding along the dimensions for square images
-            is supported, skipping"""
-        )
-
     # generate input data
     x = gen_finn_dt_tensor(idt, [1, idim_h, idim_w, num_ch])
     input_dict = {"inp": x}
     odim_h = idim_h + pad_h
     odim_w = idim_w + pad_w

-    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style)
+    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())
@@ -150,26 +142,8 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
     expected_oshape = (1, odim_h, odim_w, num_ch)
     assert y_produced.shape == expected_oshape

-    # calculate reference
-    # calculate correct pad according to parameters
-    if pad_style == 2:
-        if pad_h % 2 == 0:
-            pad_up = pad_h // 2
-        else:
-            pad_up = pad_h // 2 + 1
-        if pad_w % 2 == 0:
-            pad_left = pad_w // 2
-        else:
-            pad_left = pad_w // 2 + 1
-    else:
-        pad_up = pad_h // 2
-        pad_left = pad_w // 2
-
-    pad_down = pad_h - pad_up
-    pad_right = pad_w - pad_left
-
     y_expected = np.pad(
-        x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant"
+        x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant"
     )

     assert (y_produced == y_expected).all()
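A worked example of the simplified reference above: pad = [1, 3, 2, 3] pads one row on top (pad[0]), two rows on the bottom (pad[2]), three columns on the left (pad[1]) and three on the right (pad[3]) of an NHWC tensor:

    import numpy as np

    x = np.ones((1, 8, 8, 2), dtype=np.float32)
    pad = [1, 3, 2, 3]
    y = np.pad(x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant")
    assert y.shape == (1, 8 + 1 + 2, 8 + 3 + 3, 2)  # (1, 11, 14, 2)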
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index d1895a12675dce69070d280381a9982060e20c21..a7e7eba7ee8de81ec5eebe3e270e8e1d28564a00 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -42,6 +42,7 @@ import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
@@ -417,3 +418,67 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
         exp_cycles = exp_cycles_dict[node.name]
         assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
         assert exp_cycles != 0
+
+
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["decoupled", "const"])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType["INT4"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["INT4"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [8])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [8])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [32])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [32])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+
+    # no activation, produce accumulators
+    T = None
+    tdt = None
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        odt = DataType["UINT32"]
+    else:
+        odt = DataType["INT32"]
+
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
+    total_fold = nf * sf
+    exp_total_cycles = total_fold + 10
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    model = model.transform(DeriveCharacteristic(exp_total_cycles))
+    node_inst = getCustomOp(model.graph.node[0])
+    period_attr = node_inst.get_nodeattr("io_chrc_period")
+    assert period_attr == exp_total_cycles
+    chrc_in = node_inst.get_nodeattr("io_chrc_in")
+    chrc_out = node_inst.get_nodeattr("io_chrc_out")
+    assert chrc_in.shape == (1, 2 * exp_total_cycles)
+    assert chrc_out.shape == (1, 2 * exp_total_cycles)
+    # first sf cycles should read input continuously
+    assert (chrc_in[0, :sf] == range(1, sf + 1)).all()
+    # all outputs should be produced within the expected number of cycles
+    assert chrc_out[0, exp_total_cycles] == nf
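The io_chrc_in/io_chrc_out arrays asserted above are cumulative word counts per clock cycle, so adjacent differences recover per-cycle stream activity. A sketch of that reading (my interpretation of the assertions, not an API guarantee):

    import numpy as np

    def stream_activity(chrc_row):
        """Words moved in each cycle, from a cumulative characteristic row."""
        return np.diff(chrc_row, prepend=0)

    # e.g. for the test above, stream_activity(chrc_in[0])[:sf] is all ones:
    # the layer consumes one input word per cycle for the first sf cycles.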
diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py
index cdf69aebddc4d6af2288774acbff5dd8a52512b3..39f0b0dc89e9388c54a013becb53d9afbfb2ce4e 100644
--- a/tests/util/test_build_dataflow.py
+++ b/tests/util/test_build_dataflow.py
@@ -30,6 +30,7 @@ import pkg_resources as pk

 import pytest

+import numpy as np
 import os
 from shutil import copytree

@@ -55,7 +56,6 @@ def test_end2end_build_dataflow_directory():
     assert os.path.isfile(output_dir + "/driver/driver.py")
     assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json")
     assert os.path.isfile(output_dir + "/report/estimate_layer_resources.json")
-    assert os.path.isfile(output_dir + "/report/verify_rtlsim.vcd")
     assert os.path.isfile(output_dir + "/report/rtlsim_perf_batch_1.vcd")
     assert os.path.isfile(
         output_dir + "/report/estimate_layer_config_alternatives.json"
@@ -68,8 +68,19 @@ def test_end2end_build_dataflow_directory():
     assert os.path.isfile(output_dir + "/report/post_synth_resources.xml")
     assert os.path.isfile(output_dir + "/report/post_route_timing.rpt")
     # verification outputs
-    verify_out_dir = output_dir + "/verification_output"
-    assert os.path.isfile(verify_out_dir + "/verify_initial_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_streamlined_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_folded_hls_cppsim_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_stitched_ip_rtlsim_SUCCESS.npy")
+    verif_batchsize = np.load(target_dir + "/input.npy").shape[0]
+    for i in range(verif_batchsize):
+        verify_out_dir = output_dir + "/verification_output"
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_streamlined_python_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_folded_hls_cppsim_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_stitched_ip_rtlsim_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(output_dir + f"/report/verify_rtlsim_{i}.vcd")
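The per-batch verification outputs follow a uniform naming scheme; a helper like this (illustrative, mirroring the assertions above) collects every expected path for a given batch size:

    def expected_verification_files(output_dir, batchsize):
        steps = [
            "initial_python",
            "streamlined_python",
            "folded_hls_cppsim",
            "stitched_ip_rtlsim",
        ]
        verify_out_dir = output_dir + "/verification_output"
        files = []
        for i in range(batchsize):
            files += [
                verify_out_dir + "/verify_%s_%d_SUCCESS.npy" % (s, i) for s in steps
            ]
            files.append(output_dir + "/report/verify_rtlsim_%d.vcd" % i)
        return files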