diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index d842d89e234fd59f953a246293d271154d50954a..d3c4156d9b4ccf601d3eea348f6cb61c0d9a6e87 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -37,6 +37,13 @@ from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy
 from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map
 
 
+class AutoFIFOSizingMethod(str, Enum):
+    "Select the type of automatic FIFO sizing strategy."
+
+    CHARACTERIZE = "characterize"
+    LARGEFIFO_RTLSIM = "largefifo_rtlsim"
+
+
 class ShellFlowType(str, Enum):
     """For builds that produce a bitfile, select the shell flow that will integrate
     the FINN-generated accelerator."""
@@ -246,6 +253,12 @@ class DataflowBuildConfig:
     #: for each FIFO.
     auto_fifo_depths: Optional[bool] = True
 
+    #: When `auto_fifo_depths = True`, select which method will be used for
+    #: setting the FIFO sizes.
+    auto_fifo_strategy: Optional[
+        AutoFIFOSizingMethod
+    ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
+
     #: Memory resource type for large FIFOs
     #: Only relevant when `auto_fifo_depths = True`
     large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO
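A minimal sketch of how the new option would be selected from a build script; only the names introduced in this diff (AutoFIFOSizingMethod, auto_fifo_depths, auto_fifo_strategy) are taken from it, and the remaining config values are illustrative placeholders:

    from finn.builder.build_dataflow_config import (
        AutoFIFOSizingMethod,
        DataflowBuildConfig,
    )

    # placeholder build config: output_dir/clock/part values are illustrative
    cfg = DataflowBuildConfig(
        output_dir="build_out",
        synth_clk_period_ns=10.0,
        fpga_part="xc7z020clg400-1",
        generate_outputs=[],
        auto_fifo_depths=True,
        # opt into the characterization-based sizing instead of the
        # default LARGEFIFO_RTLSIM strategy
        auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE,
    )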
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 8290621056f9e4531693a3266bfb633735a4db33..5da608c27def8136f9ad11f62b4707452eac3120 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -79,6 +79,10 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.derive_characteristic import (
+    DeriveCharacteristic,
+    DeriveFIFOSizes,
+)
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
@@ -86,6 +90,7 @@ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
@@ -495,9 +500,9 @@ def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
 def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     """
     Depending on the auto_fifo_depths setting, do one of the following:
-    * if auto_fifo_depths=True: Run the `InsertAndSetFIFODepths` transformation
-    to attempt to determine the FIFO sizes that provide full throughput. Involves
-    running stitched-IP rtlsim and may take a long time.
+    * if auto_fifo_depths=True: Run the appropriate auto-sizing transformation
+    to attempt to determine the FIFO sizes that provide full throughput.
+    May take a long time.
     * if auto_fifo_depths=False: Assume the folding config file contains FIFO
     sizes as well. Runs the `InsertFIFO` transformation, then
     `ApplyConfig(cfg.folding_config_file)`, and finally `RemoveShallowFIFOs`.
@@ -506,13 +511,35 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     """
 
     if cfg.auto_fifo_depths:
-        model = model.transform(
-            InsertAndSetFIFODepths(
-                cfg._resolve_fpga_part(),
-                cfg._resolve_hls_clk_period(),
-                vivado_ram_style=cfg.large_fifo_mem_style,
+        if cfg.auto_fifo_strategy == "characterize":
+            model = model.transform(InsertDWC())
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(
+                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
             )
-        )
+            model = model.transform(HLSSynthIP())
+            model = model.transform(PrepareRTLSim())
+            model = model.transform(AnnotateCycles())
+            period = model.analysis(dataflow_performance)["max_cycles"] + 10
+            model = model.transform(DeriveCharacteristic(period))
+            model = model.transform(DeriveFIFOSizes())
+            model = model.transform(
+                InsertFIFO(
+                    vivado_ram_style=cfg.large_fifo_mem_style, max_qsrl_depth=256
+                )
+            )
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(GiveReadableTensorNames())
+        elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
+            model = model.transform(
+                InsertAndSetFIFODepths(
+                    cfg._resolve_fpga_part(),
+                    cfg._resolve_hls_clk_period(),
+                    vivado_ram_style=cfg.large_fifo_mem_style,
+                )
+            )
+        else:
+            assert False, "Unsupported auto_fifo_strategy: " + cfg.auto_fifo_strategy
     else:
         # assume folding cfg json contains FIFO sizes too
         # insert DWCs, FIFOs and run ApplyConfig once more
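The string comparisons in step_set_fifo_depths work because AutoFIFOSizingMethod subclasses both str and Enum, so members compare equal to their string values. A small standalone illustration of that pattern (not FINN code):

    from enum import Enum

    class SizingMethod(str, Enum):  # same pattern as AutoFIFOSizingMethod
        CHARACTERIZE = "characterize"
        LARGEFIFO_RTLSIM = "largefifo_rtlsim"

    assert SizingMethod.CHARACTERIZE == "characterize"
    # round-trips cleanly from JSON-serialized build configs
    assert SizingMethod("largefifo_rtlsim") is SizingMethod.LARGEFIFO_RTLSIM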
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 13a4c5892c8f82c37e1794057a06217981a6a580..cd0af6b3ab3d8250abbf7d48e004622e55f09f04 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -42,18 +42,21 @@ class AddStreams_Batch(HLSCustomOp):
         super().__init__(onnx_node)
 
     def get_nodeattr_types(self):
-        my_attrs = {
-            "NumChannels": ("i", True, ""),
-            "PE": ("i", True, ""),
-            # FINN DataTypes for inputs; output datatype inferred from input
-            "inputDataType": ("s", True, ""),
-            # number of input vectors, examples:
-            # [1] is a single vector (like a FC layer with batch=1)
-            # [4] is four vectors (like a FC layer with batch=4)
-            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
-            "numInputVectors": ("ints", False, [1]),
-        }
-        my_attrs.update(super().get_nodeattr_types())
+        my_attrs = super().get_nodeattr_types()
+        my_attrs.update(
+            {
+                "NumChannels": ("i", True, ""),
+                "PE": ("i", True, ""),
+                # FINN DataTypes for inputs; output datatype inferred from input
+                "inputDataType": ("s", True, ""),
+                # number of input vectors, examples:
+                # [1] is a single vector (like a FC layer with batch=1)
+                # [4] is four vectors (like a FC layer with batch=4)
+                # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+                "numInputVectors": ("ints", False, [1]),
+                "inFIFODepths": ("ints", False, [2, 2]),
+            }
+        )
         return my_attrs
 
     def get_normal_input_shape(self, ind=0):
@@ -70,10 +73,10 @@ class AddStreams_Batch(HLSCustomOp):
         ishape = tuple(vecs + [ich // pe, pe])
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_folded_input_shape()
 
     def make_shape_compatible_op(self, model):
@@ -124,11 +127,11 @@ class AddStreams_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         # we need to set output datatype to the next larger int or uint
         # enhancement: consider specifying w/ explicit outputDataType attribute
@@ -139,14 +142,14 @@ class AddStreams_Batch(HLSCustomOp):
         else:
             return DataType.get_smallest_possible(2 * idt.max())
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
@@ -357,3 +360,14 @@ class AddStreams_Batch(HLSCustomOp):
         swidth = self.get_instream_width_padded()
         intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
         return intf_names
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+                "in1": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
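The widening in get_output_datatype above can be checked by hand with the qonnx DataType API: adding two signed 8-bit streams must cover 2 * idt.min() = -256, which needs 9 signed bits. A short sketch (INT8 chosen as an arbitrary example):

    from qonnx.core.datatype import DataType

    idt = DataType["INT8"]
    # signed branch of AddStreams_Batch.get_output_datatype
    odt = DataType.get_smallest_possible(2 * idt.min())
    print(odt.name)  # INT9: range [-256, 255] covers 2 * [-128, 127]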
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index 3ed76db2982e411b711be5bd78e39dd866332714..46adca680d3c96695eeb5a91be53ea158fc78f1f 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -102,9 +102,6 @@ class ChannelwiseOp_Batch(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             "paramDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 0),
-            "outFIFODepth": ("i", False, 0),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -221,23 +218,23 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         # total cost
         return comparator_cost + lutram_cost
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         return i_bits * self.get_nodeattr("PE")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         fold = ich // pe
@@ -245,17 +242,17 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         folded_input_shape = tuple(vecs + [fold, pe])
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # same shape as input
         return self.get_folded_input_shape()
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_input_shape = tuple(vecs + [ich])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/checksum.py
index bde285eb0dd1b3818926c1feb7ac8d5de69a4be6..c927c07df21faf40ccbf9ddbe47e3f2f2ca61c89 100644
--- a/src/finn/custom_op/fpgadataflow/checksum.py
+++ b/src/finn/custom_op/fpgadataflow/checksum.py
@@ -77,31 +77,31 @@ class CheckSum(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         # here same as input data type
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dtype = DataType[self.get_nodeattr("inputDataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         return self.get_instream_width()
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         # derive normal shape from folded shape
         # checksum nodes are inserted in between fpgadataflow nodes
         # the folded shape could be for example (1, nf, pe)
@@ -127,7 +127,7 @@ class CheckSum(HLSCustomOp):
     def get_ap_int_max_w(self):
         return max(super().get_ap_int_max_w(), 32)
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py
index 5fcf9cf96cbacd4e444af0b90618a19eefb9bfe2..4437bcd1984c5194b0a19b43d692babb7e3cd158 100644
--- a/src/finn/custom_op/fpgadataflow/concat.py
+++ b/src/finn/custom_op/fpgadataflow/concat.py
@@ -74,12 +74,12 @@ class StreamingConcat(HLSCustomOp):
     def get_folded_input_shape(self, ind=0):
         return self.get_normal_input_shape(ind)
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         total_elems = self.get_total_elems()
         vecs = list(self.get_nodeattr("numInputVectors"))
         return tuple(vecs + [total_elems])
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_normal_output_shape()
 
     def make_shape_compatible_op(self, model):
@@ -106,7 +106,7 @@ class StreamingConcat(HLSCustomOp):
         # input dt identical for all inputs
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         return self.get_input_datatype()
 
     def get_instream_width(self, ind=0):
@@ -115,7 +115,7 @@ class StreamingConcat(HLSCustomOp):
         ibits = self.get_input_datatype().bitwidth()
         return elems * ibits
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         total_elems = self.get_total_elems()
         out_width = total_elems * obits
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 251a9882c58a3cf94449701795b72c8a6adab318..1566445999a2c568b5c5a112d436bf05fd89aca5 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -99,13 +99,13 @@ class ConvolutionInputGenerator(HLSCustomOp):
         assert ret[0] == ret[1] == 1, "Only dilation=1 supported"
         return ret
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         simd = self.get_nodeattr("SIMD")
@@ -114,7 +114,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -126,7 +126,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -158,15 +158,15 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns stream width, input and output stream width are equal for
         the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
@@ -176,7 +176,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         in_width = simd * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns stream width, input and output stream width are equal for
         the sliding window function, so the function to determine the input
         stream width can be reused."""
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
index aba74baecc0f40571fa288459a04ad42e167ccf6..f1c84662cc06e89df5bd7c0762ac47b8c5723502 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -91,13 +91,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         simd = self.get_nodeattr("SIMD")
@@ -106,7 +106,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -118,7 +118,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -153,15 +153,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -169,7 +169,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         in_width = simd * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         if self.use_parallel_window_output():
             # feed all window pixels in parallel
             k_h, k_w = self.get_nodeattr("ConvKernelDim")
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
index 399b36e15021af6f449df3e9ba2acdc699a27647..5424050a8ed0a353894721d5bba28c1d45e62771 100755
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -98,13 +98,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         simd = self.get_nodeattr("SIMD")
@@ -113,7 +113,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -125,7 +125,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -160,15 +160,15 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -176,7 +176,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         in_width = simd * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         if self.get_nodeattr("parallel_window"):
             # feed all window pixels in parallel
             k_h, k_w = self.get_nodeattr("ConvKernelDim")
(Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return ibits * simd - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return obits * simd diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py index 04ca45e7f1c1844a9976d46392be46f6cffc2167..93cde15ca7d42dbed12417837916359fdcc71b67 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py @@ -61,13 +61,13 @@ class DuplicateStreams_Batch(HLSCustomOp): def get_num_output_streams(self): return self.get_nodeattr("NumOutputStreams") - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [ch]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -138,22 +138,22 @@ class DuplicateStreams_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("inputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") @@ -408,3 +408,13 @@ class DuplicateStreams_Batch(HLSCustomOp): ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) ) return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out0": [], "out1": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py index a29e871fabbc01f0accd6858d69c0a96a5a8c495..a7b9c814e274e3df87fdcbc04ec9ca36ba1076e0 100644 --- a/src/finn/custom_op/fpgadataflow/eltwise.py +++ b/src/finn/custom_op/fpgadataflow/eltwise.py @@ -91,10 +91,10 @@ class StreamingEltwise(HLSCustomOp): ishape = tuple(vecs + [ich // pe, pe]) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_folded_input_shape() def make_shape_compatible_op(self, model): @@ -156,11 +156,11 @@ class StreamingEltwise(HLSCustomOp): return info_messages - def get_input_datatype(self, id=0): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType" + str(id))] + return DataType[self.get_nodeattr("inputDataType" + 
diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py
index a29e871fabbc01f0accd6858d69c0a96a5a8c495..a7b9c814e274e3df87fdcbc04ec9ca36ba1076e0 100644
--- a/src/finn/custom_op/fpgadataflow/eltwise.py
+++ b/src/finn/custom_op/fpgadataflow/eltwise.py
@@ -91,10 +91,10 @@ class StreamingEltwise(HLSCustomOp):
         ishape = tuple(vecs + [ich // pe, pe])
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_folded_input_shape()
 
     def make_shape_compatible_op(self, model):
@@ -156,11 +156,11 @@ class StreamingEltwise(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self, id=0):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
-        return DataType[self.get_nodeattr("inputDataType" + str(id))]
+        return DataType[self.get_nodeattr("inputDataType" + str(ind))]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         op = self.get_nodeattr("eltwiseOp")
         idt0 = self.get_input_datatype(0)
@@ -196,7 +196,7 @@ class StreamingEltwise(HLSCustomOp):
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
(Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return ibits * simd - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return obits * simd diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py index adafa7dcf36111e63fa49e0d184594fff54be99d..e7fa5bc0048b54a32ebc61482b96009fa019809e 100644 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py @@ -56,13 +56,13 @@ class GlobalAccPool_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [ch]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -71,7 +71,7 @@ class GlobalAccPool_Batch(HLSCustomOp): folded_ishape = tuple(vecs + [folds, pe]) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) if len(vecs) == 1: @@ -80,7 +80,7 @@ class GlobalAccPool_Batch(HLSCustomOp): oshape = tuple([vecs[0]] + [1, 1, ch]) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") unfolded_shape = list(self.get_normal_output_shape()) @@ -139,11 +139,11 @@ class GlobalAccPool_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" # determine data type from image size and input type idt = DataType[self.get_nodeattr("inputDataType")] @@ -155,14 +155,14 @@ class GlobalAccPool_Batch(HLSCustomOp): extreme_value = npixels * idt.max() return DataType.get_smallest_possible(extreme_value) - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index c5041acd46a63880160f7726946e1c609642710d..f307be95c30d822dfc517e4c331bd8d82d727997 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -29,8 +29,9 @@ import numpy as np import os import subprocess +import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import rtlsim_multi_io +from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp 
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index c5041acd46a63880160f7726946e1c609642710d..f307be95c30d822dfc517e4c331bd8d82d727997 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -29,8 +29,9 @@
 import numpy as np
 import os
 import subprocess
+import warnings
 from abc import abstractmethod
-from pyverilator.util.axi_utils import rtlsim_multi_io
+from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.base import CustomOp
 from qonnx.util.basic import roundup_to_integer_multiple
@@ -107,10 +108,18 @@ class HLSCustomOp(CustomOp):
             # ID of FPGA device to which this Op is allocated, in
             # a multi-FPGA setting
             "device_id": ("i", False, 0),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 2),
-            "outFIFODepth": ("i", False, 2),
+            # input and output FIFO depths for multi-I/O nodes
+            "inFIFODepths": ("ints", False, [2]),
+            "outFIFODepths": ("ints", False, [2]),
            "output_hook": ("s", False, ""),
+            # accumulated characteristic function over two periods
+            "io_chrc_in": ("t", False, np.asarray([], dtype=np.int32)),
+            "io_chrc_out": ("t", False, np.asarray([], dtype=np.int32)),
+            # the period for which the characterization was run
+            "io_chrc_period": ("i", False, 0),
+            # amount of zero padding inserted during chrc.
+            "io_chrc_pads_in": ("ints", False, []),
+            "io_chrc_pads_out": ("ints", False, []),
         }
 
     def get_verilog_top_module_name(self):
@@ -688,40 +697,48 @@ compilation transformations?
         HLSCustomOp class but has to be filled by every node."""
         pass
 
-    def get_normal_input_shape(self):
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input stream ind."""
+        raise Exception("get_input_datatype not implemented for this op")
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output stream ind."""
+        raise Exception("get_output_datatype not implemented for this op")
+
+    def get_normal_input_shape(self, ind=0):
         """Returns normal input shape if implemented."""
         raise Exception("get_normal_input_shape not implemented for this op")
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         """Returns normal output shape if implemented."""
         raise Exception("get_normal_output_shape not implemented for this op")
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         """Returns folded input shape (according to synapse folding), if implemented."""
         raise Exception("get_folded_input_shape not implemented for this op")
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         """Returns folded output shape (according to neuron folding), if implemented."""
         raise Exception("get_folded_output_shape not implemented for this op")
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width, if implemented."""
         raise Exception("get_instream_width not implemented for this op")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width, if implemented."""
         raise Exception("get_outstream_width not implemented for this op")
 
-    def get_instream_width_padded(self):
+    def get_instream_width_padded(self, ind=0):
         """Returns input stream width padded to a multiple of 8. This is required
         by the AXI Stream spec."""
-        in_width = self.get_instream_width()
+        in_width = self.get_instream_width(ind=ind)
         return roundup_to_integer_multiple(in_width, 8)
 
-    def get_outstream_width_padded(self):
+    def get_outstream_width_padded(self, ind=0):
         """Returns output stream width padded to a multiple of 8. This is required
         by the AXI Stream spec."""
-        out_width = self.get_outstream_width()
+        out_width = self.get_outstream_width(ind=ind)
         return roundup_to_integer_multiple(out_width, 8)
 
     def get_ap_int_max_w(self):
@@ -734,3 +751,119 @@ compilation transformations?
"AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret ) return ret + + def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): + """Return the unconstrained characteristic functions for this node.""" + # ensure rtlsim is ready + assert self.get_nodeattr("rtlsim_so") != "", ( + "rtlsim not ready for " + self.onnx_node.name + ) + if self.get_nodeattr("io_chrc_period") > 0: + warnings.warn( + "Skipping node %s: already has FIFO characteristic" + % self.onnx_node.name + ) + return + exp_cycles = self.get_exp_cycles() + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + n_outs = np.prod(self.get_folded_output_shape()[:-1]) + if exp_cycles == 0: + # try to come up with an optimistic estimate + exp_cycles = min(n_inps, n_outs) + assert ( + exp_cycles <= period + ), "Period %d too short to characterize %s : expects min %d cycles" % ( + period, + self.onnx_node.name, + exp_cycles, + ) + sim = self.get_rtlsim() + # signal name + sname = "_" + self.hls_sname() + "_" + if override_rtlsim_dict is not None: + io_dict = override_rtlsim_dict + else: + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + + # extra dicts to keep track of cycle-by-cycle transaction behavior + # note that we restrict key names to filter out weight streams etc + txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} + txns_out = { + key: [] for (key, value) in io_dict["outputs"].items() if "out" in key + } + + def monitor_txns(sim_obj): + for inp in txns_in: + in_ready = _read_signal(sim, inp + sname + "TREADY") == 1 + in_valid = _read_signal(sim, inp + sname + "TVALID") == 1 + if in_ready and in_valid: + txns_in[inp].append(1) + else: + txns_in[inp].append(0) + for outp in txns_out: + if ( + _read_signal(sim, outp + sname + "TREADY") == 1 + and _read_signal(sim, outp + sname + "TVALID") == 1 + ): + txns_out[outp].append(1) + else: + txns_out[outp].append(0) + + reset_rtlsim(sim) + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + n_outs, + sname=sname, + liveness_threshold=period, + hook_preclk=monitor_txns, + ) + assert ( + total_cycle_count <= period + ), """Total cycle count from rtl simulation is higher than + specified period, please set the period higher than {}""".format( + total_cycle_count + ) + self.set_nodeattr("io_chrc_period", period) + + def accumulate_char_fxn(chrc): + p = len(chrc) + ret = [] + for t in range(2 * p): + if t == 0: + ret.append(chrc[0]) + else: + ret.append(ret[-1] + chrc[t % p]) + return np.asarray(ret, dtype=np.int32) + + all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32) + all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32) + all_pad_in = [] + all_pad_out = [] + for in_idx, in_strm_nm in enumerate(txns_in.keys()): + txn_in = txns_in[in_strm_nm] + if len(txn_in) < period: + pad_in = period - len(txn_in) + txn_in += [0 for x in range(pad_in)] + txn_in = accumulate_char_fxn(txn_in) + all_txns_in[in_idx, :] = txn_in + all_pad_in.append(pad_in) + + for out_idx, out_strm_nm in enumerate(txns_out.keys()): + txn_out = txns_out[out_strm_nm] + if len(txn_out) < period: + pad_out = period - len(txn_out) + txn_out += [0 for x in range(pad_out)] + txn_out = accumulate_char_fxn(txn_out) + all_txns_out[out_idx, :] = txn_out + all_pad_out.append(pad_out) + + self.set_nodeattr("io_chrc_in", all_txns_in) + self.set_nodeattr("io_chrc_out", all_txns_out) + self.set_nodeattr("io_chrc_pads_in", all_pad_in) + self.set_nodeattr("io_chrc_pads_out", all_pad_out) 
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index 33ee1d359c7b82494e1b5ce1b83aa5d0199f8153..65683079fc6a648de31148e398ea498f38b8d3d9 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -100,16 +100,16 @@ class IODMA(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         vecs = list(self.get_nodeattr("numInputVectors"))
         num_ch = self.get_nodeattr("NumChannels")
         ishape = tuple(vecs + [num_ch])
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         if self.get_nodeattr("direction") == "in":
             raise ValueError("Folded input shape not defined for input IODMA")
         else:
@@ -126,7 +126,7 @@ class IODMA(HLSCustomOp):
             shape.append(elems_per_word)
         return tuple(shape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         if self.get_nodeattr("direction") == "out":
             raise ValueError("Folded output shape not defined for output IODMA")
         else:
@@ -166,15 +166,15 @@ class IODMA(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         if self.get_nodeattr("direction") == "in":
             return self.get_nodeattr("intfWidth")
         elif self.get_nodeattr("direction") == "out":
@@ -182,7 +182,7 @@ class IODMA(HLSCustomOp):
         else:
             raise ValueError("Invalid IODMA direction, please set to in or out")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         if self.get_nodeattr("direction") == "out":
             return self.get_nodeattr("intfWidth")
         elif self.get_nodeattr("direction") == "in":
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index 3e27ee01113392174c1206fc10e1c9abe82fdfe7..03f89bd7ecac69a9097f4f35c42bd528be709515 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -70,13 +70,13 @@ class LabelSelect_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         nlabels = self.get_nodeattr("Labels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         ishape = tuple(vecs + [nlabels])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         nlabels = self.get_nodeattr("Labels")
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -85,13 +85,13 @@ class LabelSelect_Batch(HLSCustomOp):
         folded_ishape = tuple(vecs + [folds, pe])
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k = self.get_nodeattr("K")
         vecs = list(self.get_nodeattr("numInputVectors"))
         oshape = tuple(vecs + [k])
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k = self.get_nodeattr("K")
         vecs = list(self.get_nodeattr("numInputVectors"))
         oshape = tuple(vecs + [k, 1])
@@ -152,24 +152,24 @@ class LabelSelect_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         ret = DataType[self.get_nodeattr("outputDataType")]
         return ret
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         return self.get_output_datatype().bitwidth()
diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py
index 613a91b6284e0789dff2446e1615690a03336d99..fd3e2b5b1cfa74eb4f957df4b568e6c46da47617 100644
--- a/src/finn/custom_op/fpgadataflow/lookup.py
+++ b/src/finn/custom_op/fpgadataflow/lookup.py
@@ -75,21 +75,21 @@ class Lookup(HLSCustomOp):
         exp_cycles = int(n_inputs)
         return exp_cycles
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         return self.get_nodeattr("InputShape")
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ishape = self.get_normal_input_shape()
         emb_dim = self.get_nodeattr("EmbeddingDim")
         oshape = list(ishape) + [emb_dim]
         return tuple(oshape)
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ishape = self.get_normal_input_shape()
         folded_ishape = list(ishape) + [1]
         return tuple(folded_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         ishape = self.get_normal_input_shape()
         mem_mode = self.get_nodeattr("mem_mode")
         emb_dim = self.get_nodeattr("EmbeddingDim")
@@ -135,19 +135,19 @@ class Lookup(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         ret = DataType[self.get_nodeattr("InputType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         ret = DataType[self.get_nodeattr("EmbeddingType")]
         return ret
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         return ibits
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         folded_oshape = self.get_folded_output_shape()
         obits = self.get_output_datatype().bitwidth()
         return obits * folded_oshape[-1]
self.get_nodeattr("PE") return out_width @@ -474,7 +474,7 @@ class MatrixVectorActivation(HLSCustomOp): return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): mh = self.get_nodeattr("MH") pe = self.get_nodeattr("PE") nf = mh // pe @@ -482,13 +482,13 @@ class MatrixVectorActivation(HLSCustomOp): folded_output_shape = tuple(vecs + [nf, pe]) return folded_output_shape - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): mw = self.get_nodeattr("MW") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [mw]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): mh = self.get_nodeattr("MH") vecs = list(self.get_nodeattr("numInputVectors")) normal_output_shape = tuple(vecs + [mh]) @@ -1227,8 +1227,11 @@ class MatrixVectorActivation(HLSCustomOp): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() ) - in_fifo_depth = self.get_nodeattr("inFIFODepth") - out_fifo_depth = self.get_nodeattr("outFIFODepth") + # TODO can we deprecate this entirely? this looks like legacy code + # that does not really serve a purpose - FIFO sizes are not typically + # allocated at this point; at best they are set to 2 as the default + in_fifo_depth = 2 + out_fifo_depth = 2 # insert depth pragmas only if specified if in_fifo_depth != 0: self.code_gen_dict["$PRAGMAS$"].append( @@ -1462,3 +1465,20 @@ class MatrixVectorActivation(HLSCustomOp): thres_count = out_features ret_dict[thres_param_type] = thres_count return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index 3bf187fa9a78ed2c812f042a29079ee1e3163d74..91cd537baeff0c7666bbf3596b46a7412ec2fe4e 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -74,11 +74,11 @@ class Pool_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("InputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" fxn = self.get_nodeattr("Function") odt = DataType[self.get_nodeattr("OutputDataType")] @@ -98,7 +98,7 @@ class Pool_Batch(HLSCustomOp): return odt - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_ch = self.get_nodeattr("Channels") odims = self.get_nodeattr("OutImgDims") batch_size = self.get_nodeattr("BatchSize") @@ -107,7 +107,7 @@ class Pool_Batch(HLSCustomOp): ishape = (batch_size, *odims, k_prod * ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") @@ -116,14 +116,14 @@ 
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
index 3bf187fa9a78ed2c812f042a29079ee1e3163d74..91cd537baeff0c7666bbf3596b46a7412ec2fe4e 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -74,11 +74,11 @@ class Pool_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("InputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         fxn = self.get_nodeattr("Function")
         odt = DataType[self.get_nodeattr("OutputDataType")]
@@ -98,7 +98,7 @@ class Pool_Batch(HLSCustomOp):
 
         return odt
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_ch = self.get_nodeattr("Channels")
         odims = self.get_nodeattr("OutImgDims")
         batch_size = self.get_nodeattr("BatchSize")
@@ -107,7 +107,7 @@ class Pool_Batch(HLSCustomOp):
         ishape = (batch_size, *odims, k_prod * ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         ifm_ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
@@ -116,14 +116,14 @@ class Pool_Batch(HLSCustomOp):
         folded_ishape = normal_ishape[:-1] + [fold, pe]
         return tuple(folded_ishape)
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ofm_ch = self.get_nodeattr("Channels")
         odims = self.get_nodeattr("OutImgDims")
         batch_size = self.get_nodeattr("BatchSize")
         oshape = (batch_size, *odims, ofm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         ifm_ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
@@ -147,13 +147,13 @@ class Pool_Batch(HLSCustomOp):
         exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size
         return int(exp_cycles)
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dt_bits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = int(dt_bits * pe)
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         dt_bits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         out_width = int(dt_bits * pe)
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 1e6b72e4d54ede639e797f32f51fb7705ec8ce4b..a3aa9d570d0efcbe82090d19a151d4f5b12078b6 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -60,19 +60,19 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ishape = self.get_nodeattr("shape")
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         oshape = self.get_nodeattr("shape")
         return oshape
 
@@ -97,7 +97,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         Please adjust PE and SIMD values so that
         OutWidth % InWidth = 0 or alternatively use impl_style = vivado"""
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         self.check_divisible_iowidths()
         iwidth = self.get_nodeattr("inWidth")
         ishape = self.get_normal_input_shape()
@@ -117,7 +117,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         dummy_t = dummy_t.reshape(new_shape)
         return dummy_t.shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         self.check_divisible_iowidths()
         owidth = self.get_nodeattr("outWidth")
         oshape = self.get_normal_output_shape()
@@ -142,11 +142,11 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         in_width = self.get_nodeattr("inWidth")
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         out_width = self.get_nodeattr("outWidth")
         return out_width
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index a0346f50bf6b7e88a79ba5ef4700039eb39c32ef..40d016de43820a37e8c7894a3e1f30146c667e59 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -75,6 +75,22 @@ class StreamingFIFO(HLSCustomOp):
 
         return my_attrs
 
+    def get_adjusted_depth(self):
+        impl = self.get_nodeattr("impl_style")
+        depth = self.get_nodeattr("depth")
+        if impl == "vivado":
+            old_depth = depth
+            # round up depth to nearest power-of-2
+            # Vivado FIFO impl may fail otherwise
+            depth = 1 << (depth - 1).bit_length()
+            if old_depth != depth:
+                warnings.warn(
+                    "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado"
+                    % (self.onnx_node.name, old_depth, depth)
+                )
+
+        return depth
+
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
         oshape = self.get_normal_output_shape()
@@ -190,10 +206,8 @@ class StreamingFIFO(HLSCustomOp):
         self.set_nodeattr("ip_vlnv", vlnv)
         self.code_gen_dict.clear()
 
-    def get_normal_input_shape(self):
-        depth = self.get_nodeattr("depth")
-        # depth has to be between 2 and 256 with the current
-        # StreamingFIFO implementation
+    def get_normal_input_shape(self, ind=0):
+        depth = self.get_adjusted_depth()
         assert depth >= 2, """Depth is too low"""
         if depth > 256 and self.get_nodeattr("impl_style") == "rtl":
             warnings.warn(
@@ -221,22 +235,22 @@ class StreamingFIFO(HLSCustomOp):
 
         return normal_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dtype = DataType[self.get_nodeattr("dataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         dtype = DataType[self.get_nodeattr("dataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
@@ -338,7 +352,7 @@ class StreamingFIFO(HLSCustomOp):
         elif impl_style == "vivado":
             cmd = []
             node_name = self.onnx_node.name
-            depth = self.get_nodeattr("depth")
+            depth = self.get_adjusted_depth()
             ram_style = self.get_nodeattr("ram_style")
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
@@ -403,7 +417,7 @@ class StreamingFIFO(HLSCustomOp):
         """Calculates resource estimation for BRAM"""
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         if impl == "rtl" or (impl == "vivado" and ram_type != "block"):
@@ -428,7 +442,7 @@ class StreamingFIFO(HLSCustomOp):
 
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         if impl == "rtl" or (impl == "vivado" and ram_type != "ultra"):
@@ -438,7 +452,7 @@ class StreamingFIFO(HLSCustomOp):
             return (math.ceil(depth / 4096)) * (math.ceil(W / 72))
 
     def bram_efficiency_estimation(self):
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
         bram16_est = self.bram_estimation()
         if bram16_est == 0:
@@ -451,7 +465,7 @@ class StreamingFIFO(HLSCustomOp):
         """Calculates resource estimations for LUTs"""
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         address_luts = 2 * math.ceil(math.log(depth, 2))
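get_adjusted_depth rounds the requested depth up to the next power of two for impl_style="vivado", and the estimators above now use the adjusted value so BRAM/URAM/LUT counts match what Vivado actually builds. The bit trick in isolation:

    for depth in [2, 3, 200, 256, 300]:
        adjusted = 1 << (depth - 1).bit_length()  # next power of two >= depth
        print(depth, "->", adjusted)
    # 2 -> 2, 3 -> 4, 200 -> 256, 256 -> 256, 300 -> 512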
@@ -451,7 +465,7 @@ class StreamingFIFO(HLSCustomOp):
         """Calculates resource estimations for LUTs"""
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         address_luts = 2 * math.ceil(math.log(depth, 2))
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 882b40a0aaf542e6dcaf427ca3567ae78394ede5..a0e60931edd8590aaebc0560c4bd28d61d62e8ea 100755
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -57,11 +57,11 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("dataType")]
 
@@ -82,13 +82,13 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
         return (ifm_dim[0] == 1) and (k[0] == 1)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         ifm_ch = self.get_nodeattr("NumChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         ifm_ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
@@ -99,7 +99,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         k_h, k_w = tuple(self.get_nodeattr("PoolDim"))
         ifm_ch = self.get_nodeattr("NumChannels")
@@ -116,7 +116,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # even though there is no folding in the current hlslib op,
         # insert a time multiplexing axis to remain compatible with the
         # shapes produced by the rest of the dataflow pipeline
@@ -155,7 +155,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         # TODO: adjust inaccurate formula
         return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1])))
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dt_bits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         ifm_ch = self.get_nodeattr("NumChannels")
@@ -165,7 +165,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             in_width = int(dt_bits * ifm_ch)
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """For streaming maxpool out stream width is the same
         as in stream width"""
         return self.get_instream_width()
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 5383cc1f4bdf9eb88c7d7bd69c25231282f11c6f..f2cc64668d62ef15446772309577e9b15a378ef5 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -75,9 +75,6 @@ class Thresholding_Batch(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 0),
-            "outFIFODepth": ("i", False, 0),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -185,11 +182,11 @@ class Thresholding_Batch(HLSCustomOp):
         # total cost
         return comparator_cost + lutram_cost
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
@@ -221,11 +218,11 @@ class Thresholding_Batch(HLSCustomOp):
         self.set_nodeattr("weightDataType", tdt.name)
         return DataType[self.get_nodeattr("weightDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         return i_bits * self.get_nodeattr("PE")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
@@ -251,7 +248,7 @@ class Thresholding_Batch(HLSCustomOp):
             weightstream = self.get_weightstream_width()
             return max([weightstream, temp_value])
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         fold = ich // pe
@@ -259,17 +256,17 @@ class Thresholding_Batch(HLSCustomOp):
         folded_input_shape = tuple(vecs + [fold, pe])
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # same shape as input
         return self.get_folded_input_shape()
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_input_shape = tuple(vecs + [ich])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
@@ -960,3 +957,20 @@ class Thresholding_Batch(HLSCustomOp):
         "Return a list of extra tcl directives for HLS synthesis."
         return ["config_compile -pipeline_style frp"]
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_tmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [
+                0 for i in range(num_w_reps * n_weight_inps)
+            ]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
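Reviewer note: `derive_characteristic_fxns` drives the node's rtlsim with dummy zero-valued streams, since only handshake timing matters for characterization, not data content. A sketch of the resulting io_dict shape, with invented example numbers:

```python
import numpy as np

# Sketch of the rtlsim I/O dict built above, with invented example values.
folded_input_shape = (1, 2, 2, 8, 4)            # hypothetical folded shape
n_inps = int(np.prod(folded_input_shape[:-1]))  # 32 input transactions
io_dict = {
    "inputs": {"in0": [0] * n_inps},  # dummy data, timing is what matters
    "outputs": {"out": []},           # rtlsim appends outputs here
}
# decoupled/external weights add a second dummy input stream whose length
# is calc_tmem() (or calc_wmem()) repeated once per input vector
n_weight_inps = 8                     # stand-in for calc_tmem()
num_w_reps = int(np.prod([1, 2, 2]))  # stand-in for numInputVectors
io_dict["inputs"]["weights"] = [0] * (num_w_reps * n_weight_inps)
```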
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 7386aa7e6311754b653e94f8d2e9b2a910a1370b..1bd32442a1986d6a86571e85a09322d6c15d8a78 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -218,21 +218,21 @@ class TLastMarker(HLSCustomOp):
     def get_number_output_values(self):
         return self.get_nodeattr("NumIters")
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         stream_width = self.get_nodeattr("StreamWidth")
         elem_width = self.get_nodeattr("ElemWidth")
         n_packed_elems = stream_width // elem_width
         n_iters = self.get_nodeattr("NumIters")
         return (1, n_iters, n_packed_elems)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_folded_input_shape()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         stream_width = self.get_nodeattr("StreamWidth")
         return stream_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         stream_width = self.get_nodeattr("StreamWidth")
         return stream_width
 
diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py
index eb51fe39fc6e7ec84204f9d541a0e47c333bbf43..a018fd35aac4d63b365e97464dab0fd4a5fa13f2 100644
--- a/src/finn/custom_op/fpgadataflow/upsampler.py
+++ b/src/finn/custom_op/fpgadataflow/upsampler.py
@@ -73,7 +73,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
         exp_cycles = OFMDim * reps
         return int(exp_cycles)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         IFMDim = self.get_nodeattr("IFMDim")
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
@@ -84,7 +84,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
             ishape = (batch, IFMDim, 1, num_ch)
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         OFMDim = self.get_nodeattr("OFMDim")
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
@@ -95,11 +95,11 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
             oshape = (batch, OFMDim, 1, num_ch)
         return oshape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         return tuple(normal_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         return tuple(normal_oshape)
 
@@ -129,21 +129,21 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
         return ibits * ifm_ch
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
         return obits * ifm_ch
 
diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index b0c05d1ad6c74ceaaaa2c932f4add3f0076bda51..0375bdea68f6c10eda8a3c5f375bbb14bc9a2be5 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -208,7 +208,7 @@ class VectorVectorActivation(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
@@ -216,16 +216,16 @@ class VectorVectorActivation(HLSCustomOp):
         """Returns FINN DataType of weights."""
         return DataType[self.get_nodeattr("weightDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         in_width = i_bits * self.get_nodeattr("PE")
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         out_width = o_bits * self.get_nodeattr("PE")
         return out_width
@@ -249,7 +249,7 @@ class VectorVectorActivation(HLSCustomOp):
 
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         nf = ch // pe
@@ -257,14 +257,14 @@ class VectorVectorActivation(HLSCustomOp):
         folded_output_shape = tuple([1, dim_h, dim_w, nf, pe])
         return folded_output_shape
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
         k_h, k_w = self.get_nodeattr("Kernel")
         normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ch = self.get_nodeattr("Channels")
         dim_h, dim_w = self.get_nodeattr("Dim")
         normal_output_shape = tuple([1, dim_h, dim_w, ch])
@@ -901,8 +901,11 @@ class VectorVectorActivation(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
         )
-        in_fifo_depth = self.get_nodeattr("inFIFODepth")
-        out_fifo_depth = self.get_nodeattr("outFIFODepth")
+        # TODO: can this be removed entirely? It looks like legacy code that
+        # no longer serves a purpose: FIFO sizes are not allocated at this
+        # point anymore, and at best are still at their default of 2
+        in_fifo_depth = 2
+        out_fifo_depth = 2
         # insert depth pragmas only if specified
         if in_fifo_depth != 0:
             self.code_gen_dict["$PRAGMAS$"].append(
@@ -1254,3 +1257,20 @@ class VectorVectorActivation(HLSCustomOp):
                 thres_count = fm
             ret_dict[thres_param_type] = thres_count
         return ret_dict
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_wmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [
+                0 for i in range(num_w_reps * n_weight_inps)
+            ]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/qnn-data/testcase/residual_testcase.onnx b/src/finn/qnn-data/testcase/residual_testcase.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..c96e8c694e3a39cdb9e5d984e1c069ceb55b3f2a
Binary files /dev/null and b/src/finn/qnn-data/testcase/residual_testcase.onnx differ
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index b7db49eb22e0ccb6e3ffbf8ccad44d4274cb2154..7e4ab34af79c52a08e737f57b2fc8f017940bcf5 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -1282,6 +1282,7 @@ class InferDuplicateStreamsLayer(Transformation):
                 inputDataType=dt.name,
                 numInputVectors=vecs,
                 NumOutputStreams=n_outputs,
+                outFIFODepths=[2] * n_outputs,
                 name="DuplicateStreams_Batch_" + node.name,
             )
 
@@ -1709,6 +1710,7 @@ class InferConcatLayer(Transformation):
                 ElemsPerStream=elems_per_stream,
                 inputDataType=dt0.name,
                 numInputVectors=inp_vec,
+                inFIFODepths=[2] * len(node.input),
             )
             graph.node.insert(node_ind, new_node)
             # remove old node
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 00e2cc3bb48bcb8b81ba4750382178a4e508bec6..2fd7c32142d3e6dc4a9694aa6cb183d3f50fdea3 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -582,6 +582,10 @@ class CreateStitchedIP(Transformation):
             if os.path.isfile(wrapper_filename_alt):
                 model.set_metadata_prop("wrapper_filename", wrapper_filename_alt)
             else:
-                raise Exception("CreateStitchedIP failed, no wrapper HDL found.")
+                raise Exception(
+                    """CreateStitchedIP failed, no wrapper HDL found under %s or %s.
+                    Please check logs under the parent directory."""
+                    % (wrapper_filename, wrapper_filename_alt)
+                )
 
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py
new file mode 100644
index 0000000000000000000000000000000000000000..822679721036c7832241db4642911ff804fb9dff
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2022, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import qonnx.custom_op.registry as registry
+import warnings
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.base import NodeLocalTransformation
+
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+class DeriveCharacteristic(NodeLocalTransformation):
+    """For each node in the graph, run rtlsim to obtain the i/o
+    characteristic function for FIFO sizing and set the attribute.
+    It is assumed that the PrepareRTLSim transformation was already
+    called on the graph.
+
+    This transformation performs rtlsim for each node, so it will run for
+    some time (minutes to hours depending on configuration).
+
+    * period (int) desired period over which the characteristic function
+      will be derived.
+
+    * num_workers (int or None) number of parallel workers, see documentation in
+      NodeLocalTransformation for more details.
+    """
+
+    def __init__(self, period, num_workers=None, manual_bypass=False):
+        super().__init__(num_workers=num_workers)
+        self.period = period
+        self.manual_bypass = manual_bypass
+
+    def applyNodeLocal(self, node):
+        op_type = node.op_type
+        if is_fpgadataflow_node(node) is True:
+            try:
+                # lookup op_type in registry of CustomOps
+                inst = registry.getCustomOp(node)
+                inst.derive_characteristic_fxns(period=self.period)
+            except KeyError:
+                # exception if op_type is not supported
+                raise Exception(
+                    "Custom op_type %s is currently not supported." % op_type
+                )
+        return (node, False)
+
+    def apply(self, model: ModelWrapper):
+        (model, run_again) = super().apply(model)
+        if not self.manual_bypass:
+            return (model, run_again)
+        # apply manual fix for DuplicateStreams and AddStreams for
+        # simple residual reconvergent paths with bypass
+        addstrm_nodes = model.get_nodes_by_op_type("AddStreams_Batch")
+        for addstrm_node in addstrm_nodes:
+            # we currently only support the case where one branch is
+            # a bypass
+            b0 = model.find_producer(addstrm_node.input[0])
+            b1 = model.find_producer(addstrm_node.input[1])
+            if (b0 is None) or (b1 is None):
+                warnings.warn("Found unsupported AddStreams, skipping")
+                return (model, run_again)
+            b0_is_bypass = b0.op_type == "DuplicateStreams_Batch"
+            b1_is_bypass = b1.op_type == "DuplicateStreams_Batch"
+            if (not b0_is_bypass) and (not b1_is_bypass):
+                warnings.warn("Found unsupported AddStreams, skipping")
+                return (model, run_again)
+            ds_node = b0 if b0_is_bypass else b1
+            comp_branch_last = b1 if b0_is_bypass else b0
+
+            ds_comp_bout = ds_node.output[0] if b0_is_bypass else ds_node.output[1]
+            comp_branch_first = model.find_consumer(ds_comp_bout)
+            if comp_branch_first is None or comp_branch_last is None:
+                warnings.warn("Found unsupported DuplicateStreams, skipping")
+                return (model, run_again)
+            comp_branch_last = registry.getCustomOp(comp_branch_last)
+            comp_branch_first = registry.getCustomOp(comp_branch_first)
+            # for DuplicateStreams, use comp_branch_first's input characterization
+            # for AddStreams, use comp_branch_last's output characterization
+            period = comp_branch_first.get_nodeattr("io_chrc_period")
+            comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[
+                : 2 * period
+            ]
+            comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[
+                2 * period :
+            ]
+            ds_node_inst = registry.getCustomOp(ds_node)
+            addstrm_node_inst = registry.getCustomOp(addstrm_node)
+            ds_node_inst.set_nodeattr("io_chrc_period", period)
+            ds_node_inst.set_nodeattr("io_characteristic", comp_branch_first_f * 2)
+            addstrm_node_inst.set_nodeattr("io_chrc_period", period)
+            addstrm_node_inst.set_nodeattr("io_characteristic", comp_branch_last_f * 2)
+            warnings.warn(
+                f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}"
+            )
+            warnings.warn(
+                f"Set {addstrm_node.name} chrc. from {comp_branch_last.onnx_node.name}"
+            )
+        return (model, run_again)
+
+
+class DeriveFIFOSizes(NodeLocalTransformation):
+    """Prerequisite: DeriveCharacteristic already called on graph.
+    For each node in the graph, use the accumulated I/O characteristic function
+    to perform FIFO sizing, setting the inFIFODepths/outFIFODepths attributes
+    of HLSCustomOp nodes.
+
+    * num_workers (int or None) number of parallel workers, see documentation in
+      NodeLocalTransformation for more details.
+    """
+
+    def __init__(self, num_workers=None):
+        super().__init__(num_workers=num_workers)
+
+    def applyNodeLocal(self, node):
+        op_type = node.op_type
+        if is_fpgadataflow_node(node) is True:
+            try:
+                # lookup op_type in registry of CustomOps
+                prod = registry.getCustomOp(node)
+                assert op_type != "StreamingFIFO", "Found existing FIFOs"
+                period = prod.get_nodeattr("io_chrc_period")
+                prod_chrc = prod.get_nodeattr("io_chrc_out")[0]
+                assert (
+                    len(prod_chrc) == 2 * period
+                ), "Found unexpected characterization attribute"
+                if any([x > 2 for x in prod.get_nodeattr("outFIFODepths")]):
+                    # FIFO depth already set, can skip this node
+                    return (node, False)
+
+                # find consumers
+                model = self.ref_input_model
+                out_fifo_depths = []
+                for output_name in node.output:
+                    cons_node = model.find_consumer(output_name)
+                    if cons_node is None:
+                        # could be final node, will be overridden if so
+                        # need an entry in the list anyway
+                        out_fifo_depths.append(2)
+                        continue
+                    cons = registry.getCustomOp(cons_node)
+                    cons_chrc = cons.get_nodeattr("io_chrc_in")[0]
+                    # find minimum phase shift satisfying the constraint
+                    pshift_min = period - 1
+                    for pshift_cand in range(period):
+                        prod_chrc_part = prod_chrc[pshift_cand:period]
+                        cons_chrc_part = cons_chrc[: period - pshift_cand]
+                        if (prod_chrc_part >= cons_chrc_part).all():
+                            pshift_min = pshift_cand
+                            break
+                    prod_chrc_part = prod_chrc[pshift_min : (pshift_min + period)]
+                    cons_chrc_part = cons_chrc[:period]
+                    fifo_depth = int((prod_chrc_part - cons_chrc_part).max())
+                    out_fifo_depths.append(fifo_depth)
+                # set output FIFO depth for this (producing) node
+                # InsertFIFO looks at the max of (outFIFODepths, inFIFODepths)
+                # for each tensor
+                prod.set_nodeattr("outFIFODepths", out_fifo_depths)
+
+            except KeyError:
+                # exception if op_type is not supported
+                raise Exception(
+                    "Custom op_type %s is currently not supported." % op_type
+                )
+        return (node, False)
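Reviewer note: to see the sizing rule above in isolation, here is the same phase-shift search run on a pair of toy characteristic functions (all values invented). The characteristic functions are cumulative transaction counts per cycle; the producer curve is shifted right until its cumulative supply covers the consumer's cumulative demand everywhere, and the FIFO depth is the largest surplus that must be buffered at that shift:

```python
import numpy as np

# Toy example of the DeriveFIFOSizes sizing rule (invented numbers).
period = 6
# producer's cumulative outputs over a 2*period window
prod_chrc = np.array([0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4])
# consumer's cumulative input demand over a 2*period window
cons_chrc = np.array([0, 0, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4])

# find the minimum phase shift at which supply always covers demand
pshift_min = period - 1
for pshift_cand in range(period):
    prod_part = prod_chrc[pshift_cand:period]
    cons_part = cons_chrc[: period - pshift_cand]
    if (prod_part >= cons_part).all():
        pshift_min = pshift_cand
        break

# FIFO depth = worst-case surplus the FIFO must hold at that shift
prod_part = prod_chrc[pshift_min : pshift_min + period]
cons_part = cons_chrc[:period]
fifo_depth = int((prod_part - cons_part).max())
assert pshift_min == 1 and fifo_depth == 2
```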
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 78200b280960ad53e3e84d44394c10296c432ba5..79bd717a5d96e7a9839740d73254db53e5133e13 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -70,16 +70,26 @@ class InsertFIFO(Transformation):
     node attribute 'outFIFODepth' of the previous and node attribute
     'inFIFODepth' of the subsequent node. max() of these two values sets
     the FIFO depth.
 
-    Normally, shallow-depth (<=2) FIFOs won't be created since HLS streaming
-    interfaces already have a degree of buffering. You can set
-    create_shallow_fifos=True to override this default behavior.
+    Constructor arguments:
+
+    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
+      Verilog FIFOs (Q_srl.v)
+    - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
+      large FIFOs implemented by Vivado
+    - create_shallow_fifos : Normally, shallow-depth (<=2) FIFOs won't be
+      created since HLS streaming interfaces already have a degree of
+      buffering. Override with this parameter.
 
     The other node attributes necessary to create a FIFO node are taken from the
     node the FIFO node is inserted after: 'folded_shape' and 'dtype'"""
 
-    def __init__(self, create_shallow_fifos=False):
+    def __init__(
+        self, create_shallow_fifos=False, max_qsrl_depth=None, vivado_ram_style="auto"
+    ):
         super().__init__()
         self.create_shallow_fifos = create_shallow_fifos
+        self.max_qsrl_depth = max_qsrl_depth
+        self.vivado_ram_style = vivado_ram_style
 
     def apply(self, model):
         graph = model.graph
@@ -88,8 +98,8 @@ class InsertFIFO(Transformation):
         for first_node in graph.node:
             node_ind += 1
             if _suitable_node(first_node):
-                for n_output in first_node.output:
-                    consumers = model.find_consumers(n_output)
+                for idx_out, output_name in enumerate(first_node.output):
+                    consumers = model.find_consumers(output_name)
                     if consumers == []:
                         continue
                     if len(consumers) > 1:
@@ -108,11 +118,9 @@ class InsertFIFO(Transformation):
                         # input of the second node is equal
                         n1 = getCustomOp(consumer)
                         for idx, inp in enumerate(consumer.input):
-                            if inp == n_output:
-                                if idx == 0:
-                                    fld_shape_2 = n1.get_folded_input_shape()
-                                else:
-                                    fld_shape_2 = n1.get_folded_input_shape(ind=idx)
+                            if inp == output_name:
+                                fld_shape_2 = n1.get_folded_input_shape(ind=idx)
+                                idx_inp = idx
                         assert _suitable_folded_shapes(
                             fld_shape, fld_shape_2
                         ), """The
@@ -122,12 +130,10 @@ class InsertFIFO(Transformation):
 
                         # check if outFIFOdepth attribute of first node
                         # and inFIFOdepth attribute of consumer node is equal
-                        n0_depth = n0.get_nodeattr("outFIFODepth")
-                        n1_depth = n1.get_nodeattr("inFIFODepth")
-                        if n0_depth == n1_depth:
-                            fifo_depth = n0_depth
-                        elif n0_depth != n1_depth:
-                            fifo_depth = max(n0_depth, n1_depth)
+                        n0_depth = n0.get_nodeattr("outFIFODepths")[idx_out]
+                        n1_depth = n1.get_nodeattr("inFIFODepths")[idx_inp]
+
+                        fifo_depth = max(n0_depth, n1_depth)
 
                         if fifo_depth > 2 or self.create_shallow_fifos:
                             # assumption: HLS streaming components already have
@@ -143,25 +149,40 @@ class InsertFIFO(Transformation):
                             graph.value_info.append(fifo_output_tensor)
                             model.set_tensor_datatype(fifo_output_tensor.name, dtype)
 
+                            if (
+                                self.max_qsrl_depth is None
+                                or fifo_depth <= self.max_qsrl_depth
+                            ):
+                                impl_style = "rtl"
+                            else:
+                                impl_style = "vivado"
+
                             fifo_node = oh.make_node(
                                 "StreamingFIFO",
-                                [n_output],
+                                [output_name],
                                 [fifo_output_tensor.name],
                                 domain="finn.custom_op.fpgadataflow",
                                 backend="fpgadataflow",
                                 depth=fifo_depth,
                                 folded_shape=fld_shape,
                                 dataType=str(dtype.name),
+                                impl_style=impl_style,
+                                ram_style=self.vivado_ram_style,
                             )
                             # insert fifo
                             graph.node.insert(node_ind + 1, fifo_node)
                             # set fifo output tensor as new input tensor of second node
                             for idx, inp in enumerate(consumer.input):
-                                if inp == n_output:
+                                if inp == output_name:
                                     consumer.input[idx] = fifo_output_tensor.name
                             # ensure created FIFO depth is reflected on both sides
-                            n0.set_nodeattr("outFIFODepth", fifo_depth)
-                            n1.set_nodeattr("inFIFODepth", fifo_depth)
+                            odepths = n0.get_nodeattr("outFIFODepths")
+                            odepths[idx_out] = fifo_depth
+                            n0.set_nodeattr("outFIFODepths", odepths)
+                            idepths = n1.get_nodeattr("inFIFODepths")
+                            idepths[idx_inp] = fifo_depth
+                            n1.set_nodeattr("inFIFODepths", idepths)
+
                             graph_modified = True
 
         if graph_modified is False:
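Reviewer note: the impl_style choice added at each FIFO insertion site in this file follows one rule, repeated three times. Extracted as a standalone sketch (illustrative helper, not FINN API):

```python
# The impl_style selection rule used at each FIFO insertion site:
def choose_impl_style(fifo_depth, max_qsrl_depth=None):
    # shallow FIFOs use the Verilog Q_srl implementation ("rtl");
    # FIFOs deeper than max_qsrl_depth fall back to Vivado IP
    if max_qsrl_depth is None or fifo_depth <= max_qsrl_depth:
        return "rtl"
    return "vivado"

assert choose_impl_style(1024) == "rtl"  # no threshold set
assert choose_impl_style(256, max_qsrl_depth=256) == "rtl"
assert choose_impl_style(1024, max_qsrl_depth=256) == "vivado"
```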
@@ -177,13 +198,9 @@ class InsertFIFO(Transformation):
                 n_input = first_node.input[inp_ind]
                 n0 = getCustomOp(first_node)
                 # determine fifo node attributes
-                if inp_ind == 0:
-                    fld_shape = n0.get_folded_input_shape()
-                    dtype = n0.get_input_datatype()
-                else:
-                    fld_shape = n0.get_folded_input_shape(inp_ind)
-                    dtype = n0.get_input_datatype(inp_ind)
-                fifo_depth = n0.get_nodeattr("inFIFODepth")
+                fld_shape = n0.get_folded_input_shape(inp_ind)
+                dtype = n0.get_input_datatype(inp_ind)
+                fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind]
 
                 if fifo_depth <= 2:
                     warnings.warn("Overriding input FIFO depth to 32")
@@ -198,6 +215,11 @@ class InsertFIFO(Transformation):
                 graph.value_info.append(fifo_output_tensor)
                 model.set_tensor_datatype(fifo_output_tensor.name, dtype)
 
+                if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth:
+                    impl_style = "rtl"
+                else:
+                    impl_style = "vivado"
+
                 fifo_node = oh.make_node(
                     "StreamingFIFO",
                     [n_input],
@@ -207,6 +229,8 @@ class InsertFIFO(Transformation):
                     depth=fifo_depth,
                     folded_shape=fld_shape,
                     dataType=str(dtype.name),
+                    impl_style=impl_style,
+                    ram_style=self.vivado_ram_style,
                 )
                 # insert fifo
                 graph.node.insert(0, fifo_node)
@@ -227,10 +251,11 @@ class InsertFIFO(Transformation):
             ), """Insert tlast marker should be done
                 after inserting the FIFOs"""
             n0 = getCustomOp(final_node)
+            out_ind = list(final_node.output).index(graph_out_name)
             # determine fifo node attributes
-            fld_shape = n0.get_folded_output_shape()
-            dtype = n0.get_output_datatype()
-            fifo_depth = n0.get_nodeattr("outFIFODepth")
+            fld_shape = n0.get_folded_output_shape(out_ind)
+            dtype = n0.get_output_datatype(out_ind)
+            fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind]
 
             if fifo_depth <= 2:
                 warnings.warn("Overriding output FIFO depth to 32")
@@ -245,6 +270,11 @@ class InsertFIFO(Transformation):
             graph.value_info.append(fifo_input_tensor)
             model.set_tensor_datatype(fifo_input_tensor.name, dtype)
 
+            if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth:
+                impl_style = "rtl"
+            else:
+                impl_style = "vivado"
+
             fifo_node = oh.make_node(
                 "StreamingFIFO",
                 [fifo_input_tensor.name],
@@ -254,6 +284,8 @@ class InsertFIFO(Transformation):
                 depth=fifo_depth,
                 folded_shape=fld_shape,
                 dataType=str(dtype.name),
+                impl_style=impl_style,
+                ram_style=self.vivado_ram_style,
             )
             # insert fifo
             graph.node.append(fifo_node)
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 90ea853b6072b145df64a8a73ee93c65989fe447..f715aaeffb6d4d00f2e14c5fb25ec931443d5d97 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -250,14 +250,21 @@ class InsertAndSetFIFODepths(Transformation):
         )
         assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
         node = getCustomOp(node)
+        ifd = node.get_nodeattr("inFIFODepths")
+        ofd = node.get_nodeattr("outFIFODepths")
         if self.max_depth is not None:
-            node.set_nodeattr("inFIFODepth", self.max_depth)
-            node.set_nodeattr("outFIFODepth", self.max_depth)
+            ifd = [self.max_depth] * len(ifd)
+            ofd = [self.max_depth] * len(ofd)
         else:
-            i_depth = np.prod(node.get_folded_input_shape()[:-1])
-            o_depth = np.prod(node.get_folded_output_shape()[:-1])
-            node.set_nodeattr("inFIFODepth", i_depth)
-            node.set_nodeattr("outFIFODepth", o_depth)
+            # set each FIFO to its tensor size (excluding the innermost
+            # stream-width dimension, hence the [:-1])
+            for i in range(len(ifd)):
+                ifd[i] = np.prod(node.get_folded_input_shape(i)[:-1])
+            for o in range(len(ofd)):
+                ofd[o] = np.prod(node.get_folded_output_shape(o)[:-1])
+        node.set_nodeattr("inFIFODepths", ifd)
+        node.set_nodeattr("outFIFODepths", ofd)
+
         if node.onnx_node.op_type in extw_optypes:
             mmode = node.get_nodeattr("mem_mode")
             if mmode == "external":
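Reviewer note: the hunk above completes the scalar-to-list migration for the FIFO depth attributes. Each HLSCustomOp now carries one depth per input and per output stream, and InsertFIFO sizes the FIFO on a connecting tensor as the max of the producer's and consumer's entry for that tensor. A small sketch with invented values:

```python
# Invented example: a producer with one output feeding a two-input
# consumer (e.g. an AddStreams-style node). Depths are per-stream lists.
producer_out_fifo_depths = [64]    # one entry per producer output
consumer_in_fifo_depths = [32, 2]  # one entry per consumer input
# InsertFIFO sizes the FIFO on the connecting tensor as the max of the
# producer-side and consumer-side requests for that tensor:
idx_out, idx_inp = 0, 0
fifo_depth = max(
    producer_out_fifo_depths[idx_out], consumer_in_fifo_depths[idx_inp]
)
assert fifo_depth == 64
```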
@@ -380,8 +387,11 @@ class InsertAndSetFIFODepths(Transformation):
                 reset_implementation(node_inst)
                 del fifos[node.name]
             else:
-                getCustomOp(node).set_nodeattr("inFIFODepth", 0)
-                getCustomOp(node).set_nodeattr("outFIFODepth", 0)
+                inst = getCustomOp(node)
+                ifd = inst.get_nodeattr("inFIFODepths")
+                ofd = inst.get_nodeattr("outFIFODepths")
+                inst.set_nodeattr("inFIFODepths", [0] * len(ifd))
+                inst.set_nodeattr("outFIFODepths", [0] * len(ofd))
         # for every extw node we changed from external to decoupled,
         # change back and reset implementation
         if node.op_type in extw_optypes:
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 103f18b514c23c4e1ad35a85d020dc0481aa9c47..5f787d1f889645d04884aed9b89a0b1c91d1f418 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -569,8 +569,8 @@ class TestEnd2End:
         for node in hls_layers:
             if node.op_type != "StreamingFIFO":
                 op_inst = getCustomOp(node)
-                assert op_inst.get_nodeattr("inFIFODepth") == 0
-                assert op_inst.get_nodeattr("outFIFODepth") == 0
+                assert op_inst.get_nodeattr("inFIFODepths") == [0]
+                assert op_inst.get_nodeattr("outFIFODepths") == [0]
         model.save(
             get_checkpoint_name(
                 topology, wbits, abits, QONNX_export, "fifodepth_" + kind
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fd1439bd055782692bac404622137e166ef5e07
--- /dev/null
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2022 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import pytest
+
+import json
+import shutil
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+from finn.util.basic import make_build_dir
+from finn.util.test import get_trained_network_and_ishape
+
+
+def fetch_test_model(topology, wbits=2, abits=2):
+    tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology)
+    (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
+    chkpt_name = tmp_output_dir + "/model.onnx"
+    BrevitasONNXManager.export(model, ishape, chkpt_name)
+    return tmp_output_dir
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fifosizing_linear():
+    tmp_output_dir = fetch_test_model("tfc")
+    cfg = build_cfg.DataflowBuildConfig(
+        output_dir=tmp_output_dir,
+        auto_fifo_depths=True,
+        auto_fifo_strategy="characterize",
+        target_fps=10000,
+        synth_clk_period_ns=10.0,
+        board="Pynq-Z1",
+        rtlsim_batch_size=100,
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.STITCHED_IP,
+            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        ],
+        default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED,
+    )
+    build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg)
+    with open(tmp_output_dir + "/report/estimate_network_performance.json") as f:
+        est_data = json.load(f)
+    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
+        sim_data = json.load(f)
+    assert (
+        float(sim_data["throughput[images/s]"])
+        / float(est_data["estimated_throughput_fps"])
+        > 0.9
+    )
+    shutil.rmtree(tmp_output_dir)
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index d1895a12675dce69070d280381a9982060e20c21..a7e7eba7ee8de81ec5eebe3e270e8e1d28564a00 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -42,6 +42,7 @@ import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
@@ -417,3 +418,67 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
         exp_cycles = exp_cycles_dict[node.name]
         assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
         assert exp_cycles != 0
+
+
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["decoupled", "const"])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType["INT4"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["INT4"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [8])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [8])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [32])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [32])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+
+    # no activation, produce accumulators
+    T = None
+    tdt = None
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        odt = DataType["UINT32"]
+    else:
+        odt = DataType["INT32"]
+
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
+    total_fold = nf * sf
+    exp_total_cycles = total_fold + 10
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    model = model.transform(DeriveCharacteristic(exp_total_cycles))
+    node_inst = getCustomOp(model.graph.node[0])
+    period_attr = node_inst.get_nodeattr("io_chrc_period")
+    assert period_attr == exp_total_cycles
+    chrc_in = node_inst.get_nodeattr("io_chrc_in")
+    chrc_out = node_inst.get_nodeattr("io_chrc_out")
+    assert chrc_in.shape == (1, 2 * exp_total_cycles)
+    assert chrc_out.shape == (1, 2 * exp_total_cycles)
+    # first sf cycles should read input continuously
+    assert (chrc_in[0, :sf] == range(1, sf + 1)).all()
+    # all outputs should be produced within the expected number of cycles
+    assert chrc_out[0, exp_total_cycles] == nf
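Reviewer note: for readers decoding the final assertions, here is a toy picture (invented numbers, assumed semantics) of what the characterization attributes hold: io_chrc_in and io_chrc_out are cumulative transaction counts over a 2*period cycle window, so a fully pipelined read phase shows up as a count that ramps by one per cycle.

```python
import numpy as np

# Toy picture of the cumulative-count layout the test assertions rely on,
# for period = 10, with 4 reads then 4 writes (invented numbers):
period = 10
chrc_in = np.array([[1, 2, 3, 4] + [4] * 16])   # reads on cycles 0-3
chrc_out = np.array([[0] * 16 + [1, 2, 3, 4]])  # writes on cycles 16-19
assert chrc_in.shape == chrc_out.shape == (1, 2 * period)
# back-to-back reads ramp by one per cycle, as asserted for the MVAU above
assert (chrc_in[0, :4] == np.arange(1, 5)).all()
# all 4 outputs have been produced by the end of the window
assert chrc_out[0, -1] == 4
```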