diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index e2f96395ad74255ad67549255608cd52737e97d9..cd14765f388d76b3e42ba88e959c4eecb87ccab0 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -30,6 +30,7 @@ import numpy as np from shutil import copy import subprocess import math +import warnings from finn.custom_op.fpgadataflow import HLSCustomOp from finn.core.datatype import DataType @@ -178,14 +179,11 @@ class StreamingFIFO(HLSCustomOp): depth = self.get_nodeattr("depth") # depth has to be between 2 and 256 with the current # StreamingFIFO implementation - assert ( - depth >= 2 - ), """Depth is too low. Please set node attribute "depth" to a value - between 2 and 256""" - assert ( - depth <= 256 - ), """Depth is too high. Please set node attribute "depth" to a value - between 2 and 256""" + assert depth >= 2, """Depth is too low""" + if depth > 256 and self.get_nodeattr("impl_style") == "rtl": + warnings.warn( + "Depth is high, set between 2 and 256 for efficient SRL implementation" + ) # derive normal shape from folded shape # StreamingFIFOs are inserted in between fpgadataflow nodes # the folded shape could be for example (1, nf, pe) @@ -424,7 +422,6 @@ class StreamingFIFO(HLSCustomOp): else: return (math.ceil(depth / 4096)) * (math.ceil(W / 72)) - def bram_efficiency_estimation(self): depth = self.get_nodeattr("depth") W = self.get_instream_width() @@ -450,4 +447,3 @@ class StreamingFIFO(HLSCustomOp): ram_luts = 0 return int(address_luts + ram_luts) - diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 6f7fde0c4faba09e584eb578819f44c18639bc9d..38d438927677b853e1f256adcc1ca3048cdf1f28 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -4,6 +4,7 @@ from onnx import helper as oh from finn.custom_op.registry import getCustomOp from finn.transformation import Transformation from finn.util.fpgadataflow import is_fpgadataflow_node +import warnings import numpy as np @@ -56,66 +57,81 @@ class InsertFIFO(Transformation): for n in graph.node: node_ind += 1 if _suitable_node(n): - n_output = n.output[0] - consumer = model.find_consumer(n_output) - if _suitable_node(consumer) is True: - n0 = getCustomOp(n) - # determine fifo node attributes - fld_shape = n0.get_folded_output_shape() - dtype = n0.get_output_datatype() - - # check if folded_shape of output of first node and - # input of the second node is equal - n1 = getCustomOp(consumer) - fld_shape_2 = n1.get_folded_input_shape() - assert _suitable_folded_shapes( - fld_shape, fld_shape_2 - ), """The - folded output shape of the first node is not the same as the - folded output shape of the second node. A streaming fifo can't - be implemented in between these nodes.""" - - # check if outFIFOdepth attribute of first node - # and inFIFOdepth attribute of consumer node is equal - n0_depth = n0.get_nodeattr("outFIFODepth") - n1_depth = n1.get_nodeattr("inFIFODepth") - if n0_depth == n1_depth: - fifo_depth = n0_depth - elif n0_depth != n1_depth: - fifo_depth = max(n0_depth, n1_depth) - - if fifo_depth > 2: - # assumption: HLS streaming components already have - # depth-2 FIFOs on inputs and outputs, so no point - # creating additional small FIFOs in between -- - # we only create the larger FIFOs specified - # create fifo node - fifo_output_tensor = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - n0.get_normal_output_shape(), + for n_output in n.output: + consumers = model.find_consumers(n_output) + if consumers is None: + continue + if len(consumers) > 1: + warnings.warn( + n.name + + ": HLS node with fan-out higher than 1 cannot be stitched" ) - graph.value_info.append(fifo_output_tensor) - model.set_tensor_datatype(fifo_output_tensor.name, dtype) - - fifo_node = oh.make_node( - "StreamingFIFO", - [n_output], - [fifo_output_tensor.name], - domain="finn", - backend="fpgadataflow", - depth=fifo_depth, - folded_shape=fld_shape, - dataType=str(dtype.name), - ) - # insert fifo - graph.node.insert(node_ind + 1, fifo_node) - # set fifo output tensor as new input tensor of second node - consumer.input[0] = fifo_output_tensor.name - # ensure created FIFO depth is reflected on both sides - n0.set_nodeattr("outFIFODepth", fifo_depth) - n1.set_nodeattr("inFIFODepth", fifo_depth) - graph_modified = True + consumer = consumers[0] + if _suitable_node(consumer) is True: + n0 = getCustomOp(n) + # determine fifo node attributes + fld_shape = n0.get_folded_output_shape() + dtype = n0.get_output_datatype() + + # check if folded_shape of output of first node and + # input of the second node is equal + n1 = getCustomOp(consumer) + for idx, inp in enumerate(consumer.input): + if inp == n_output: + if idx == 0: + fld_shape_2 = n1.get_folded_input_shape() + else: + fld_shape_2 = n1.get_folded_input_shape(ind=idx) + assert _suitable_folded_shapes( + fld_shape, fld_shape_2 + ), """The + folded output shape of the first node is not the same as the + folded output shape of the second node. A streaming fifo can't + be implemented in between these nodes.""" + + # check if outFIFOdepth attribute of first node + # and inFIFOdepth attribute of consumer node is equal + n0_depth = n0.get_nodeattr("outFIFODepth") + n1_depth = n1.get_nodeattr("inFIFODepth") + if n0_depth == n1_depth: + fifo_depth = n0_depth + elif n0_depth != n1_depth: + fifo_depth = max(n0_depth, n1_depth) + + if fifo_depth > 2: + # assumption: HLS streaming components already have + # depth-2 FIFOs on inputs and outputs, so no point + # creating additional small FIFOs in between -- + # we only create the larger FIFOs specified + # create fifo node + fifo_output_tensor = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + n0.get_normal_output_shape(), + ) + graph.value_info.append(fifo_output_tensor) + model.set_tensor_datatype(fifo_output_tensor.name, dtype) + + fifo_node = oh.make_node( + "StreamingFIFO", + [n_output], + [fifo_output_tensor.name], + domain="finn", + backend="fpgadataflow", + depth=fifo_depth, + folded_shape=fld_shape, + dataType=str(dtype.name), + ) + # insert fifo + graph.node.insert(node_ind + 1, fifo_node) + # set fifo output tensor as new input tensor of second node + for idx, inp in enumerate(consumer.input): + if inp == n_output: + consumer.input[idx] = fifo_output_tensor.name + # ensure created FIFO depth is reflected on both sides + n0.set_nodeattr("outFIFODepth", fifo_depth) + n1.set_nodeattr("inFIFODepth", fifo_depth) + graph_modified = True if graph_modified is False: # insert FIFO as first node, except when first node is DMA @@ -131,30 +147,31 @@ class InsertFIFO(Transformation): dtype = n0.get_input_datatype() fifo_depth = n0.get_nodeattr("inFIFODepth") - # create fifo node - fifo_output_tensor = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - n0.get_normal_input_shape(), - ) - graph.value_info.append(fifo_output_tensor) - model.set_tensor_datatype(fifo_output_tensor.name, dtype) - - fifo_node = oh.make_node( - "StreamingFIFO", - [n_input], - [fifo_output_tensor.name], - domain="finn", - backend="fpgadataflow", - depth=fifo_depth, - folded_shape=fld_shape, - dataType=str(dtype.name), - ) - # insert fifo - graph.node.insert(0, fifo_node) - - # set fifo output tensor as new input tensor of second node - n.input[0] = fifo_output_tensor.name + if fifo_depth > 2: + # create fifo node + fifo_output_tensor = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + n0.get_normal_input_shape(), + ) + graph.value_info.append(fifo_output_tensor) + model.set_tensor_datatype(fifo_output_tensor.name, dtype) + + fifo_node = oh.make_node( + "StreamingFIFO", + [n_input], + [fifo_output_tensor.name], + domain="finn", + backend="fpgadataflow", + depth=fifo_depth, + folded_shape=fld_shape, + dataType=str(dtype.name), + ) + # insert fifo + graph.node.insert(0, fifo_node) + + # set fifo output tensor as new input tensor of second node + n.input[0] = fifo_output_tensor.name # insert FIFO as last node, except when last node is DMA if ( @@ -173,29 +190,30 @@ class InsertFIFO(Transformation): dtype = n0.get_output_datatype() fifo_depth = n0.get_nodeattr("outFIFODepth") - # create fifo node - fifo_input_tensor = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - n0.get_normal_output_shape(), - ) - graph.value_info.append(fifo_input_tensor) - model.set_tensor_datatype(fifo_output_tensor.name, dtype) - - fifo_node = oh.make_node( - "StreamingFIFO", - [fifo_input_tensor.name], - [graph_out_name], - domain="finn", - backend="fpgadataflow", - depth=fifo_depth, - folded_shape=fld_shape, - dataType=str(dtype.name), - ) - # insert fifo - graph.node.append(fifo_node) - - # set fifo output tensor as new input tensor of second node - n.output[0] = fifo_input_tensor.name + if fifo_depth > 2: + # create fifo node + fifo_input_tensor = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + n0.get_normal_output_shape(), + ) + graph.value_info.append(fifo_input_tensor) + model.set_tensor_datatype(fifo_input_tensor.name, dtype) + + fifo_node = oh.make_node( + "StreamingFIFO", + [fifo_input_tensor.name], + [graph_out_name], + domain="finn", + backend="fpgadataflow", + depth=fifo_depth, + folded_shape=fld_shape, + dataType=str(dtype.name), + ) + # insert fifo + graph.node.append(fifo_node) + + # set fifo output tensor as new input tensor of second node + n.output[0] = fifo_input_tensor.name return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 798bbd335f0028d1103d992fd2b8b9cd30bbb6e1..71712d8ca8e3fb7f4050dd0f489d74f177f2cab8 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -26,7 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import copy import math import numpy as np import warnings @@ -39,7 +38,7 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO -from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames from finn.core.rtlsim_exec import ( _reset_rtlsim, _toggle_clk, @@ -49,6 +48,12 @@ from finn.util.fpgadataflow import ( ) +def reset_implementation(node): + node.set_nodeattr("code_gen_dir_ipgen", "") + node.set_nodeattr("ipgen_path", "") + node.set_nodeattr("ip_path", "") + + def set_signal(sim, keyw, value): for i in range(len(sim.inputs)): input_name = sim.inputs[i][0] @@ -56,6 +61,13 @@ def set_signal(sim, keyw, value): sim.io[input_name] = value +def get_signal(sim, keyw): + for i in range(len(sim.outputs)): + output_name = sim.outputs[i][0] + if keyw in output_name: + return sim.io[output_name] + + def optimize_depth(depth): if depth <= 2: return 2 @@ -63,7 +75,7 @@ def optimize_depth(depth): return 32 if depth <= 1024: return int(2 ** math.ceil(math.log2(depth))) - return int(math.ceil(depth / 1024)) + return int(math.ceil(depth / 1024) * 1024) class SetFIFODepths(Transformation): @@ -73,28 +85,28 @@ class SetFIFODepths(Transformation): images on input (random/constant data) and keep track of maximum occupancy counts in each FIFO.""" - def __init__(self, fpgapart, clk_ns=10.0): + def __init__(self, fpgapart, clk_ns=10.0, max_qsrl_depth=256, max_depth=2 ** 14): super().__init__() self.fpgapart = fpgapart self.clk_ns = clk_ns + self.max_qsrl_depth = max_qsrl_depth + self.max_depth = max_depth def apply(self, model): - orig_model = model - - # work on a copy of the model - model = copy.deepcopy(model) - - # change external to decoupled and warn user; + # change external to decoupled and warn user # this way we are sure we have exactly one input/output + modified_fc_nodes = [] for node in model.graph.node: node = getCustomOp(node) - node.set_nodeattr("inFIFODepth", 2 ** 14) - node.set_nodeattr("outFIFODepth", 2 ** 14) + node.set_nodeattr("inFIFODepth", self.max_depth) + node.set_nodeattr("outFIFODepth", self.max_depth) if node.onnx_node.op_type == "StreamingFCLayer_Batch": mmode = node.get_nodeattr("mem_mode") if mmode == "external": + modified_fc_nodes.append(node.onnx_node.name) node.set_nodeattr("mem_mode", "decoupled") + reset_implementation(node) warnings.warn( "Changed mem_mode from external to decoupled for " + node.onnx_node.name @@ -104,27 +116,17 @@ class SetFIFODepths(Transformation): model = model.transform(InsertDWC()) model = model.transform(InsertFIFO()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) # gather FIFO names, check they are of expected depth fifos = {} for node in model.graph.node: if node.op_type == "StreamingFIFO": - consumer = model.find_consumers(node.output[0]) - if consumer is not None: - consumer = consumer[0].name - producer = model.find_producer(node.input[0]) - if producer is not None: - producer = producer.name - fifos[node.name] = { - "depth": 0, - "consumer": consumer, - "producer": producer, - } + fifos[node.name] = 0 node = getCustomOp(node) - # check depths - # if model came in with FIFOs, the depths will not have been updated - if node.get_nodeattr("depth") != 2 ** 14: - node.set_nodeattr("depth", 2 ** 14) + # check depths and fix as necessary + if node.get_nodeattr("depth") != self.max_depth: + node.set_nodeattr("depth", self.max_depth) # insert FIFOs and do all transformations for RTLsim model = model.transform(AnnotateCycles()) @@ -138,12 +140,17 @@ class SetFIFODepths(Transformation): # calculate input frequency (number of cycles for each input word) first_node = getCustomOp(model.graph.node[0]) - ncycles_per_input = math.ceil( - perf["max_cycles"] - / ( - np.prod(first_node.get_folded_input_shape()) - / first_node.get_folded_input_shape()[-1] - ) + ncycles_per_input = max( + 1, + int( + math.ceil( + perf["max_cycles"] + / ( + np.prod(first_node.get_folded_input_shape()) + / first_node.get_folded_input_shape()[-1] + ) + ) + ), ) # set sufficiently large threshold for 1 image to fully execute and exit @@ -161,6 +168,7 @@ class SetFIFODepths(Transformation): set_signal(sim, "tready", 1) set_signal(sim, "tdata", 0) + output_detected = False while ncycles > 0: _toggle_clk(sim) # set/unset valids @@ -181,37 +189,83 @@ class SetFIFODepths(Transformation): current_count = current_addr + 2 else: current_count = current_state - if current_count > fifos[key]["depth"]: - fifos[key]["depth"] = current_count - ncycles = ncycles - 1 - - # for each node in the original graph, determine in/outFIFODepth - ret = {} - for key in fifos: - predecessor_node = fifos[key]["producer"] - if predecessor_node is not None: - if predecessor_node not in ret: - ret[predecessor_node] = {"inFIFODepth": 0, "outFIFODepth": 0} - out_depth = ret[predecessor_node]["outFIFODepth"] - ret[predecessor_node]["outFIFODepth"] = max( - out_depth, fifos[key]["depth"] - ) + if current_count > fifos[key]: + fifos[key] = current_count - succcessor_node = fifos[key]["consumer"] - if succcessor_node is not None: - if succcessor_node not in ret: - ret[succcessor_node] = {"inFIFODepth": 0, "outFIFODepth": 0} - in_depth = ret[succcessor_node]["inFIFODepth"] - ret[succcessor_node]["inFIFODepth"] = max(in_depth, fifos[key]["depth"]) - - # tweak and apply depths to original model - for node in orig_model.graph.node: - if node.name in ret: - depths = ret[node.name] - node = getCustomOp(node) - node.set_nodeattr("inFIFODepth", optimize_depth(depths["inFIFODepth"])) - node.set_nodeattr( - "outFIFODepth", optimize_depth(depths["outFIFODepth"]) - ) + # since latency estimation is very pessimistic, detect first output + # and fast-forward the sim + if get_signal(sim, "tvalid") != 0 and not output_detected: + ncycles = max_cycles + output_detected = True + else: + ncycles = ncycles - 1 + + if not output_detected: + warnings.warn( + "No output detected, calculated FIFO depths may not be correct" + ) + + # Apply depths back into the model; + # also set in/outFIFODepth to zero for non-FIFO + # nodes, preventing further FIFO insertion + for node in model.graph.node: + # set FIFO depth, reset FIFO implementation, + # and set implementation/ram styles + if node.op_type == "StreamingFIFO": + assert node.name in fifos, "FIFO node not found in size dictionary" + # set depth of FIFO + depth = optimize_depth(fifos[node.name]) + node_inst = getCustomOp(node) + node_inst.set_nodeattr("depth", depth) + # Set FIFO implementation/ram styles + if depth > self.max_qsrl_depth: + node_inst.set_nodeattr("impl_style", "vivado") + node_inst.set_nodeattr("ram_style", "auto") + else: + node_inst.set_nodeattr("impl_style", "rtl") + # reset implementation + reset_implementation(node_inst) + del fifos[node.name] + else: + getCustomOp(node).set_nodeattr("inFIFODepth", 0) + getCustomOp(node).set_nodeattr("outFIFODepth", 0) + # for every FC node we changed from external to decoupled, + # change back and reset implementation + if node.op_type == "StreamingFCLayer_Batch": + if node.name in modified_fc_nodes: + node_inst = getCustomOp(node) + node_inst.set_nodeattr("mem_mode", "external") + reset_implementation(node_inst) + modified_fc_nodes.remove(node.name) + + assert ( + len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0 + ), "FIFO/FC nodes left untouched after model reconfiguration" + + # Remove FIFOs which have depth <= 2 + shallow_fifos = [] + # First, bypass them + for node in model.graph.node: + if ( + node.op_type == "StreamingFIFO" + and getCustomOp(node).get_nodeattr("depth") <= 2 + ): + shallow_fifos.append(node) + consumers = model.find_consumers(node.output[0]) + if consumers is None: + producer = model.find_producer(node.input[0]) + for idx, inp in enumerate(producer.output): + if inp == node.input[0]: + producer.output[idx] = node.output[0] + else: + assert len(consumers) == 1, "Fanout detected from FIFO output" + consumer = consumers[0] + # set fifo input tensor as new input tensor of second node + for idx, inp in enumerate(consumer.input): + if inp == node.output[0]: + consumer.input[idx] = node.input[0] + # now filter out + for node_to_remove in shallow_fifos: + model.graph.node.remove(node_to_remove) - return (orig_model, False) + return (model, False) diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py index 3fe747a84985b2702ffb1e5855d9071362efebda..f849ee5267de1ddab96a948f8c3408c62957fd8a 100644 --- a/src/finn/util/fpgadataflow.py +++ b/src/finn/util/fpgadataflow.py @@ -86,21 +86,40 @@ def pyverilate_stitched_ip(model): def file_to_basename(x): return os.path.basename(os.path.realpath(x)) - all_verilog_dirs = list(map(file_to_dir, all_verilog_srcs)) - all_verilog_files = list( - set( - filter( - lambda x: x.endswith(".v"), - list(map(file_to_basename, all_verilog_srcs)), - ) - ) - ) - top_module_name = model.get_metadata_prop("wrapper_filename") - top_module_name = file_to_basename(top_module_name).strip(".v") + top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) + top_module_name = top_module_file_name.strip(".v") build_dir = make_build_dir("pyverilator_ipstitched_") + + # dump all Verilog code to a single file + # this is because large models with many files require + # a verilator command line too long for bash on most systems + # NOTE: there are duplicates in this list, and some files + # are identical but in multiple directories (regslice_core.v) + + # remove duplicates from list by doing list -> set -> list + all_verilog_files = list(set(filter(lambda x: x.endswith(".v"), all_verilog_srcs))) + + # remove all but one instances of regslice_core.v + filtered_verilog_files = [] + remove_entry = False + for vfile in all_verilog_files: + if "regslice_core" in vfile: + if not remove_entry: + filtered_verilog_files.append(vfile) + remove_entry = True + else: + filtered_verilog_files.append(vfile) + + # concatenate all verilog code into a single file + with open(vivado_stitch_proj_dir + "/" + top_module_file_name, "w") as wf: + for vfile in filtered_verilog_files: + with open(vfile) as rf: + wf.write("//Added from " + vfile + "\n\n") + wf.write(rf.read()) + sim = PyVerilator.build( - all_verilog_files, - verilog_path=all_verilog_dirs, + top_module_file_name, + verilog_path=[vivado_stitch_proj_dir], build_dir=build_dir, trace_depth=get_rtlsim_trace_depth(), top_module_name=top_module_name,