diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 51e751610108289c04fa0ae72fe593cbb0c16a86..38214427094ff9b43d86606a27f0f3473cab4ef8 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -259,6 +259,10 @@ class DataflowBuildConfig: AutoFIFOSizingMethod ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM + #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test + #: if set to True, always using Python instead + force_python_rtlsim: Optional[bool] = False + #: Memory resource type for large FIFOs #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index ad7e1da054d35b52d8f41926aeee6fe1523cf2fd..790145054d49af008059364e466c1c28e588ad2f 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -487,6 +487,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period(), vivado_ram_style=cfg.large_fifo_mem_style, + force_python_sim=cfg.force_python_rtlsim, ) ) else: diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index f715aaeffb6d4d00f2e14c5fb25ec931443d5d97..948e87511d94a582ce245fd9c70dfec1a8ad9e0a 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -42,7 +42,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.util.fpgadataflow import is_fpgadataflow_node -from finn.util.pyverilator import pyverilate_stitched_ip +from finn.util.pyverilator import 
pyverilate_stitched_ip, verilator_fifosim def reset_implementation(node): @@ -227,6 +227,7 @@ class InsertAndSetFIFODepths(Transformation): max_depth=None, swg_exception=True, vivado_ram_style="auto", + force_python_sim=False, ): super().__init__() self.fpgapart = fpgapart @@ -235,6 +236,7 @@ class InsertAndSetFIFODepths(Transformation): self.max_depth = max_depth self.swg_exception = swg_exception self.vivado_ram_style = vivado_ram_style + self.force_python_sim = force_python_sim def apply(self, model): # these optypes may potentially use external weights @@ -306,57 +308,75 @@ class InsertAndSetFIFODepths(Transformation): model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") - # calculate input frequency (number of cycles for each input word) - first_node = getCustomOp(model.graph.node[0]) - ncycles_per_input = max( - 1, - int( - math.ceil( - perf["max_cycles"] - / ( - np.prod(first_node.get_folded_input_shape()) - / first_node.get_folded_input_shape()[-1] + if self.force_python_sim: + # do rtlsim in Python for FIFO sizing + # calculate input frequency (number of cycles for each input word) + first_node = getCustomOp(model.graph.node[0]) + ncycles_per_input = max( + 1, + int( + math.ceil( + perf["max_cycles"] + / ( + np.prod(first_node.get_folded_input_shape()) + / first_node.get_folded_input_shape()[-1] + ) ) - ) - ), - ) + ), + ) - # set sufficiently large threshold for 1 image to fully execute and exit - ncycles = int(latency + max_cycles) + # set sufficiently large threshold for 1 image to fully execute and exit + ncycles = int(latency + max_cycles) - # prepare pyverilator model - sim = pyverilate_stitched_ip(model) + # prepare pyverilator model + sim = pyverilate_stitched_ip(model) - reset_rtlsim(sim) - toggle_clk(sim) + reset_rtlsim(sim) + toggle_clk(sim) - # set all input valids to 0 and output readies to 1 - # set input data to some constant - set_signal(sim, "tvalid", 0) - set_signal(sim, 
"tready", 1) - set_signal(sim, "tdata", 0) + # set all input valids to 0 and output readies to 1 + # set input data to some constant + set_signal(sim, "tvalid", 0) + set_signal(sim, "tready", 1) + set_signal(sim, "tdata", 0) + + output_detected = False + while ncycles > 0: + toggle_clk(sim) + # set/unset valids + if ncycles % ncycles_per_input == 0: + set_signal(sim, "tvalid", 1) + else: + set_signal(sim, "tvalid", 0) - output_detected = False - while ncycles > 0: - toggle_clk(sim) - # set/unset valids - if ncycles % ncycles_per_input == 0: - set_signal(sim, "tvalid", 1) - else: - set_signal(sim, "tvalid", 0) + # since latency estimation is very pessimistic, detect first output + # and fast-forward the sim + if get_signal(sim, "tvalid") != 0 and not output_detected: + ncycles = max_cycles + output_detected = True + else: + ncycles = ncycles - 1 - # since latency estimation is very pessimistic, detect first output - # and fast-forward the sim - if get_signal(sim, "tvalid") != 0 and not output_detected: - ncycles = max_cycles - output_detected = True + if not output_detected: + warnings.warn( + "No output detected, calculated FIFO depths may not be correct" + ) + else: + # do rtlsim in C++ for FIFO sizing + # determine # inputs for FIFO sizing according to topology type + swg_nodes = [ + x for x in model.graph.node if "ConvolutionInputGenerator" in x.op_type + ] + if len(swg_nodes) == 0: + # MLP, no layer overlap + # assuming half the nodes are now FIFOs, use half the # of + # nodes as # inputs to drive the simulation + n_inputs = int(len(model.graph.node) / 2) else: - ncycles = ncycles - 1 - - if not output_detected: - warnings.warn( - "No output detected, calculated FIFO depths may not be correct" - ) + # convnet, single input is typically enough to fill entire + # layer pipeline due to overlaps + n_inputs = 1 + sim = verilator_fifosim(model, n_inputs) for ind, node in enumerate(fifo_nodes): maxcount_name = "maxcount_%d" % ind