diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 05e41a48a8f4cb34616bf06c01b652afb9ae4257..38940ccb94f11fe49af5f49ee020f150326a026c 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -63,7 +63,7 @@ class AddStreams_Batch(HLSCustomOp):
         ishape = tuple(vecs + [ich])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         assert ich % pe == 0, "PE must divide NumChannels"
@@ -362,5 +362,5 @@ class AddStreams_Batch(HLSCustomOp):
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["s_axis"] = ["in0_V_V", "in1_V_V"]
+        intf_names["s_axis"].append(("in1_V_V", self.get_instream_width_padded()))
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 10a8051730217b56873b5a53c0803e3b90dada90..73da77bd3f940cee5ffd10fcfc43571f1a612eb4 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -312,7 +312,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = [
             """DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format(
-                self.get_outstream_width(), self.get_number_output_values() // 2,
+                self.get_outstream_width(),
+                self.get_number_output_values() // 2,
             )
         ]
 
@@ -378,5 +379,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["m_axis"] = ["out0_V_V", "out1_V_V"]
+        intf_names["m_axis"] = [
+            ("out0_V_V", self.get_outstream_width_padded()),
+            ("out1_V_V", self.get_outstream_width_padded()),
+        ]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 02912b2d5f45b3bab0eaca13ee0a0bf19bf9cfca..39069e4c157f37ea65acf7c7b3da7a78e1ab2d0e 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -130,8 +130,8 @@ class HLSCustomOp(CustomOp):
         intf_names = {}
         intf_names["clk"] = ["ap_clk"]
         intf_names["rst"] = ["ap_rst_n"]
-        intf_names["s_axis"] = ["in0_V_V"]
-        intf_names["m_axis"] = ["out_V_V"]
+        intf_names["s_axis"] = [("in0_V_V", self.get_instream_width_padded())]
+        intf_names["m_axis"] = [("out_V_V", self.get_outstream_width_padded())]
         intf_names["aximm"] = []
         intf_names["axilite"] = []
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index a6cddcc4aeb45957c16249cd57f122fe5e58b85a..857496a2614894588ebf065db3e384cf2cecf106 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -355,11 +355,9 @@ class IODMA(HLSCustomOp):
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
         if self.get_nodeattr("direction") == "out":
-            intf_names["s_axis"] = ["in0_V_V"]
             intf_names["m_axis"] = []
         else:
             intf_names["s_axis"] = []
-            intf_names["m_axis"] = ["out_V_V"]
         intf_names["axilite"] = ["s_axi_control"]
-        intf_names["aximm"] = ["m_axi_gmem"]
+        intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("intfWidth"))]
        return intf_names
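Editor's note: downstream consumers of get_verilog_top_module_intf_names() (CreateStitchedIP, CreateVitisXO) now expect each AXI stream and AXI-MM entry to carry its padded bit width alongside the interface name. A minimal sketch of the new dictionary layout, with illustrative widths not taken from any particular layer:

    # each AXI stream entry is a (name, width_bits) tuple instead of a bare name
    intf_names = {
        "clk": ["ap_clk"],
        "rst": ["ap_rst_n"],
        "s_axis": [("in0_V_V", 8)],    # (interface name, padded stream width)
        "m_axis": [("out_V_V", 8)],
        "aximm": [],                   # e.g. [("m_axi_gmem", 64)] for an IODMA
        "axilite": [],
    }
    stream_name, stream_width = intf_names["s_axis"][0]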
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 23c1779a27c123583c0c8af5f53d022d03e78126..4d84b74dce001fca769ed2850a8f718ac942f14c 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -395,8 +395,8 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         # create a hierarchy for this layer, with the same port names
         clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
         rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-        dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-        din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+        dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+        din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
         cmd.append("create_bd_cell -type hier %s" % node_name)
         cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
         cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 8868002c9e2cb8726eeb573e104140e3e1a61d27..3cc01ade73fc6b735509f2839e5c10785a8b9f54 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -444,12 +444,24 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         single_pe_w = simd * weight_bits
         return max([weightstream, max_of_io, single_pe_w])
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
         simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
         sf = mw // simd
+        nf = mh // pe
         vecs = list(self.get_nodeattr("numInputVectors"))
-        folded_input_shape = tuple(vecs + [sf, simd])
+
+        if ind == 0:
+            # calculate shape of input 0
+            folded_input_shape = tuple(vecs + [sf, simd])
+        elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # calculate shape of input 1 (weights)
+            folded_input_shape = tuple(vecs + [sf * nf, simd * pe])
+        else:
+            raise Exception("Undefined input shape for requested input")
+
         return folded_input_shape
 
     def get_folded_output_shape(self):
@@ -1253,8 +1265,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # create a hierarchy for this layer, with the same port names
         clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
         rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-        dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-        din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+        dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+        din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
         cmd.append("create_bd_cell -type hier %s" % node_name)
         cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
         cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
@@ -1348,8 +1360,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # TODO calculate and pass in segment size here
             cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
-        elif mem_mode == "const":
-            # base class impl sufficient for const mode
+        elif mem_mode == "const" or mem_mode == "external":
+            # base class impl sufficient for const/external modes
             return super().code_generation_ipi()
         else:
             raise Exception("Unrecognized mem_mode for StreamingFCLayer")
@@ -1359,7 +1371,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         intf_names = super().get_verilog_top_module_intf_names()
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "external":
-            intf_names["s_axis"] = ["in0_V_V", "weights_V_V"]
+            intf_names["s_axis"].append(
+                ("weights_V_V", self.get_weightstream_width_padded())
+            )
         if mem_mode == "decoupled":
             # only expose axilite interface if attribute is set
             runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
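Editor's note: with mem_mode="external", the weight stream becomes a second, wider input whose folded shape delivers one SIMD*PE-element word per cycle for sf*nf cycles. A worked example with hypothetical folding attributes, following the arithmetic in get_folded_input_shape above:

    # assume MW=8, MH=8, SIMD=2, PE=2, numInputVectors=[1]
    sf, nf = 8 // 2, 8 // 2           # 4 folds along the input and output dims
    in0_shape = (1, sf, 2)            # ind=0: activations, (1, 4, 2)
    in1_shape = (1, sf * nf, 2 * 2)   # ind=1: weight stream, (1, 16, 4)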
StreamingFCLayer") @@ -1359,7 +1371,9 @@ class StreamingFCLayer_Batch(HLSCustomOp): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "external": - intf_names["s_axis"] = ["in0_V_V", "weights_V_V"] + intf_names["s_axis"].append( + ("weights_V_V", self.get_weightstream_width_padded()) + ) if mem_mode == "decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index fb41bceca09fe544bd729537b1af726c9c43d290..133a869b28cf9968a719e243a3266dfb25b637ba 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -343,8 +343,8 @@ class StreamingFIFO(HLSCustomOp): # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] cmd.append("create_bd_cell -type hier %s" % node_name) cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index 9ec03ea5dd726b49b157a92addef05f85f02b644..6700019a4a430d785967a684ad1ca8d186d32bae 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -36,6 +36,7 @@ from finn.util.data_packing import ( finnpy_to_packed_bytearray, packed_bytearray_to_finnpy, ) +from warnings import warn from finn.util.basic import gen_finn_dt_tensor @@ -86,11 +87,17 @@ class FINNExampleOverlay(Overlay): self.batch_size = batch_size self.fclk_mhz = fclk_mhz if self.platform == "alveo": - self.idma = self.idma0 + if "input_dma_name" in io_shape_dict.keys(): + self.idma = getattr(self, io_shape_dict["input_dma_name"]) + else: + self.idma = self.idma0 self.odma = self.odma0 self.odma_handle = None elif self.platform == "zynq-iodma": - self.idma = self.idma0 + if "input_dma_name" in io_shape_dict.keys(): + self.idma = getattr(self, io_shape_dict["input_dma_name"]) + else: + self.idma = self.idma0 self.odma = self.odma0 # set the clock frequency as specified by user during transformations if self.fclk_mhz > 0: @@ -98,8 +105,65 @@ class FINNExampleOverlay(Overlay): else: raise ValueError("Supported platforms are zynq-iodma alveo") # load any runtime weights + self.external_weights = [] + self.load_external_weights() self.load_runtime_weights() + def load_external_weights(self): + """Load any existing runtime weights from the specified dir into the + appropriate layer of the accelerator. Note that this must be enabled + during the accelerator build process. The runtime weights directory + is specified as the class member ``runtime_weight_dir``. + + Parameters + ---------- + flush_accel: bool + Run the accelerator with dummy input after weights are written to + flush any stale weight data in the weight streamer FIFOs. + verify: bool + Whether the written weights will be re-read and verified. 
+ """ + + w_filenames = [] + if not os.path.isdir(self.runtime_weight_dir): + return + for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir): + w_filenames.extend(filenames) + + tmp_weight_dict = {} + + for w_filename in w_filenames: + if w_filename.endswith(".npy"): + weight_tensor = np.load(self.runtime_weight_dir + "/" + w_filename) + else: + continue + + idma_name = w_filename.split(".")[0] + tmp_weight_dict[idma_name] = weight_tensor + + if self.platform != "alveo" and len(tmp_weight_dict) > 0: + # Todo: add zynq support pynq API is different + warn("external_weights are not yet supported for non-Alveo builds") + return + + for idma_name in tmp_weight_dict.keys(): + if idma_name in self.ip_dict.keys(): + iwdma = getattr(self, idma_name) + weight_tensor = tmp_weight_dict[idma_name] + weight_buf = allocate(weight_tensor.shape, dtype=np.uint8) + weight_buf[:] = weight_tensor + weight_buf.sync_to_device() + + self.external_weights += [(iwdma, weight_buf)] + + if "number_of_external_weights" in self._io_shape_dict: + hw_ext_weights = self._io_shape_dict["number_of_external_weights"] + assert len(self.external_weights) == hw_ext_weights, ( + "Number of hardware external weights and number of external " + + "weight tensors available do not match. \n" + + "Is runtime_weight_dir pointing to the correct folder?" + ) + def load_runtime_weights(self, flush_accel=True, verify=True): """Load any existing runtime weights from the specified dir into the appropriate layer of the accelerator. Note that this must be enabled @@ -124,18 +188,25 @@ class FINNExampleOverlay(Overlay): if w_filename.endswith(".dat"): with open(self.runtime_weight_dir + "/" + w_filename, "r") as f: dat = f.read() + else: + continue layer_w = np.fromiter( [int(x, 16) for x in dat.strip().split()], dtype=np.uint32 ) - layer_ind = int(w_filename.split("_")[0]) - rt_weight_dict[layer_ind] = layer_w - for layer_ind in rt_weight_dict.keys(): - cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind + sdp_ind = int(w_filename.split("_")[0]) + layer_ind = int(w_filename.split("_")[1]) + rt_weight_dict[(sdp_ind, layer_ind)] = layer_w + for sdp_ind, layer_ind in rt_weight_dict.keys(): + cand_if_name = "StreamingDataflowPartition_%d/s_axilite_%d" % ( + sdp_ind, + layer_ind, + ) if cand_if_name in self.ip_dict.keys(): layer_mmio = getattr( - self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind + getattr(self, "StreamingDataflowPartition_%d" % sdp_ind), + "s_axilite_%d" % layer_ind, ).mmio - layer_w = rt_weight_dict[layer_ind] + layer_w = rt_weight_dict[(sdp_ind, layer_ind)] layer_mmio.write_mm(0, layer_w.tobytes()) if verify: new_w = np.copy(layer_mmio.array[: layer_w.shape[0]]) @@ -289,6 +360,8 @@ class FINNExampleOverlay(Overlay): elif self.platform == "alveo": assert self.odma_handle is None, "Output DMA is already running" self.idma.start(self.ibuf_packed_device, batch_size) + for iwdma, iwbuf in self.external_weights: + iwdma.start(iwbuf, batch_size) self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size) else: raise Exception("Unrecognized platform: %s" % self.platform) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 19fa5c603bfafe16ed151e10fa8eb11a79106ede..aed5792a63ff95803b4d7ccc80cf2c94ac732ad7 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -29,6 +29,7 @@ import os import warnings import 
@@ -200,57 +236,30 @@ class CreateStitchedIP(Transformation):
             assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist."
             ip_dirs += [ip_dir_value]
             self.create_cmds += node_inst.code_generation_ipi()
-            my_producer = model.find_producer(node.input[0])
             self.connect_clk_rst(node)
             self.connect_axi(node)
-            if my_producer is None:
-                # first node in graph
-                self.connect_s_axis_external(node)
-                if node.op_type == "TLastMarker":
-                    assert (
-                        node_inst.get_nodeattr("Direction") == "in"
-                    ), """Output TLastMarker incorrect direction"""
-                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
-                    # don't apply this check for a 1-node partition
-                    assert (
-                        node_inst.get_nodeattr("direction") == "in"
-                    ), """Input DMA incorrect direction"""
-            else:
-                # intermediate node
-                # wire up input(s) to previous node output(s)
-                # foreach input
-                #   find producer
-                #   find index of producer output connected to our target input
-                #   get names of hdl interfaces for input and producer output
-                #   issue a TCL directive to connect input to output
-                #   if FC layer with mode "decoupled", add a streamer on input 1
-                for i in range(len(node.input)):
+            for i in range(len(node.input)):
+                if is_external_input(model, node, i):
+                    self.connect_s_axis_external(node, idx=i)
+                else:
                     producer = model.find_producer(node.input[i])
                     if producer is None:
                         continue
                     j = list(producer.output).index(node.input[i])
                     src_intf_name = getCustomOp(
                         producer
-                    ).get_verilog_top_module_intf_names()["m_axis"][j]
+                    ).get_verilog_top_module_intf_names()["m_axis"][j][0]
                     dst_intf_name = node_inst.get_verilog_top_module_intf_names()[
                         "s_axis"
-                    ][i]
+                    ][i][0]
                     self.connect_cmds.append(
                         "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                         "[get_bd_intf_pins %s/%s]"
                         % (producer.name, src_intf_name, node.name, dst_intf_name)
                     )
-            if model.find_consumers(node.output[0]) is None:
-                # last node in graph
-                self.connect_m_axis_external(node)
-                if node.op_type == "TLastMarker":
-                    assert (
-                        node_inst.get_nodeattr("Direction") == "out"
-                    ), """Output TLastMarker incorrect direction"""
-                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
-                    assert (
-                        node_inst.get_nodeattr("direction") == "out"
-                    ), """Output DMA incorrect direction"""
+            for i in range(len(node.output)):
+                if is_external_output(model, node, i):
+                    self.connect_m_axis_external(node, idx=i)
 
         # create a temporary folder for the project
         prjname = "finn_vivado_stitch_proj"
@@ -316,7 +325,7 @@ class CreateStitchedIP(Transformation):
         block_library = "finn"
         block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name)
         model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv)
-        model.set_metadata_prop("vivado_stitch_ifnames", str(self.intf_names))
+        model.set_metadata_prop("vivado_stitch_ifnames", json.dumps(self.intf_names))
         tcl.append(
             (
                 "ipx::package_project -root_dir %s/ip -vendor %s "
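Editor's note: since the interface metadata is now serialized with json.dumps, consumers must parse it with json.loads rather than eval. JSON round-trips the (name, width) tuples as two-element lists, which is why later code indexes entries positionally. An illustrative result for a stitched graph with one external-weight FC layer (widths hypothetical):

    import json
    ifnames = json.loads(model.get_metadata_prop("vivado_stitch_ifnames"))
    # {"clk": ["ap_clk"], "rst": ["ap_rst_n"],
    #  "s_axis": [["s_axis_0", 8], ["s_axis_1", 32]],
    #  "m_axis": [["m_axis_0", 8]],
    #  "aximm": [], "axilite": []}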
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 0f2b8ef6a4c0858cd98218538930c97c6df2ad9d..e7bf29da36e9978911c5bfc64665dba4d2edca4e 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -59,7 +59,20 @@ class InsertDWC(Transformation):
                     n0 = getCustomOp(n)
                     n1 = getCustomOp(consumer)
                     n0_out_shape = n0.get_folded_output_shape()
-                    n1_in_shape = n1.get_folded_input_shape()
+
+                    # if FC and external mem, it could be connected to input 1
+                    if (
+                        consumer.op_type == "StreamingFCLayer_Batch"
+                        and n1.get_nodeattr("mem_mode") == "external"
+                    ):
+                        # get input idx
+                        in_idx = None
+                        for idx, n_input in enumerate(consumer.input):
+                            if n_output == n_input:
+                                in_idx = idx
+                        assert in_idx is not None, "Malformed model"
+                        n1_in_shape = n1.get_folded_input_shape(in_idx)
+                    else:
+                        n1_in_shape = n1.get_folded_input_shape()
+
                     if n0_out_shape[-1] != n1_in_shape[-1]:
                         graph_modified = True
                         # determine dwc inwidth
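Editor's note: the index-aware lookup matters because the external weight stream has a different folded width than the activation stream on input 0. A small runnable illustration of the check InsertDWC performs (shapes hypothetical):

    # producer (weight IODMA) emits 8 elements per cycle, but the FC layer's
    # input 1 expects SIMD*PE = 4 elements per cycle -> a DWC is required
    n0_out_shape = (1, 16, 8)   # folded output of the producer
    n1_in_shape = (1, 32, 4)    # consumer.get_folded_input_shape(1)
    assert n0_out_shape[-1] != n1_in_shape[-1]  # triggers DWC insertion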
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 67143547557a9b24b311e69cff6f885f8745cd3c..ebd7cbe0276d3e9b4128275b0a65b1a9a40d1f80 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -48,6 +48,39 @@ class InsertIODMA(Transformation):
         ), "max_intfwidth must be a power of 2"
         self.max_intfwidth = max_intfwidth
 
+    def get_mem_init(self, weights, pe, simd):
+        """
+        Returns a matrix ready for pack_innermost_dim_as_hex_string with
+        reverse=False (finn.util.data_packing), so that the memory init file
+        is packed little-endian. That is, for elements elem(pe, simd),
+        get_mem_init returns:
+        addr = 0: [(pe-1, simd-1), (pe-1, simd-2), ..., (0, 1), (0, 0)]
+        addr = 1: [(pe-1, simd*2-1), ..., (0, simd+1), (0, simd)]
+        and so on.
+        """
+        w_shape = weights.shape
+        assert len(w_shape) == 2, "weights with incorrect number of dims"
+        inp_w, out_w = w_shape
+
+        assert out_w % pe == 0, "Malformed weight matrix"
+        assert inp_w % simd == 0, "Malformed weight matrix"
+        reshaped_w = np.zeros(inp_w * out_w).reshape(-1, pe * simd)
+
+        addr = 0
+        for fr in range(out_w // pe):
+            for fc in range(inp_w // simd):
+                tile = weights[
+                    (fc * simd) : ((fc + 1) * simd), (fr * pe) : ((fr + 1) * pe)
+                ]
+                for p in range(pe):
+                    reshaped_w[addr, (p * simd) : ((p + 1) * simd)] = tile[
+                        :, p
+                    ].transpose()
+                addr += 1
+        reshaped_w = np.flip(reshaped_w, axis=-1)
+        return reshaped_w
+
     def apply(self, model):
         # only makes sense for a pure fpgadataflow graph -- so we check!
         all_nodes = list(model.graph.node)
@@ -78,11 +111,6 @@ class InsertIODMA(Transformation):
             return (model, False)
         else:
             if final_node.op_type != "IODMA":
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(graph_out_name) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_out_name) == DataLayout.NC
-                ), "Data layout of output tensor must be NHWC or NC"
                 out_shape = model.get_tensor_shape(graph_out_name)
                 out_dtype = model.get_tensor_datatype(graph_out_name)
                 final_node_inst = getCustomOp(final_node)
@@ -123,11 +151,6 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.node.append(dma_node)
             if first_node.op_type != "IODMA":
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(graph_in_name) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
-                ), "Data layout of input tensor must be NHWC or NC"
                 in_shape = model.get_tensor_shape(graph_in_name)
                 in_dtype = model.get_tensor_datatype(graph_in_name)
                 first_node_inst = getCustomOp(first_node)
@@ -168,11 +191,6 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.node.insert(0, dma_node)
             for fc_node in fc_extw_nodes:
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(fc_node.input[1]) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
-                ), "Data layout of tensors must be NHWC or NC"
                 fc_w_name = fc_node.input[1]
                 w_shape = model.get_tensor_shape(fc_w_name)
                 w_dtype = model.get_tensor_datatype(fc_w_name)
@@ -185,21 +203,24 @@ class InsertIODMA(Transformation):
                 # calculate width of stream output from DMA
                 pe = get_by_name(fc_node.attribute, "PE").i
                 simd = get_by_name(fc_node.attribute, "SIMD").i
-                assert pe * simd == w_shape[0], "Malformed weight matrix"
                 streamWidth = simd * pe * w_dtype.bitwidth()
                 # make new buffer
+                W = model.get_initializer(fc_w_name)
+                iodma_mem = self.get_mem_init(W, pe, simd)
+                model.set_initializer(fc_w_name, iodma_mem)
+
                 fc_node_in = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
                 )
                 model.graph.value_info.append(fc_node_in)
                 model.set_tensor_datatype(fc_node_in.name, w_dtype)
-                model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name))
+                model.set_initializer(fc_node_in.name, W)
                 dma_node = oh.make_node(
                     "IODMA",
                     [fc_w_name],
                     [fc_node_in.name],
-                    numInputVectors=[w_shape[1]],
-                    NumChannels=w_shape[0],
+                    numInputVectors=[iodma_mem.shape[0]],
+                    NumChannels=pe * simd,
                     dataType=str(w_dtype.name),
                     intfWidth=intfwidth,
                     streamWidth=streamWidth,
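Editor's note: to make the reordering concrete, here is a small trace of get_mem_init under hypothetical folding (pe = simd = 2 on a 4x4 matrix); the comments follow the loop above step by step:

    import numpy as np

    W = np.arange(16).reshape(4, 4)   # W[i, j]: weight for input i, output j
    # address 0 packs the tile for outputs 0..1 x inputs 0..1:
    #   pre-flip : [W[0,0], W[1,0], W[0,1], W[1,1]] = [0, 4, 1, 5]
    #   post-flip: [5, 1, 4, 0]   # elem(pe-1, simd-1) ends up in the LSBs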
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 42f18d9a812d2db2119351dabfbb38e68c33194e..f75ef766dc939f6b8660825203e30ff3904cf5ea 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -38,12 +38,38 @@ import warnings
 import pkg_resources as pk
 
 from . import template_driver
 from finn.core.modelwrapper import ModelWrapper
+import numpy as np
+
+from finn.util.data_packing import (
+    pack_innermost_dim_as_hex_string,
+    hexstring2npbytearray,
+)
+from finn.util.basic import roundup_to_integer_multiple
+
+
+def to_external_tensor(init, w_dtype):
+    """Return an appropriately formatted and packed numpy byte array for given
+    external parameter tensor."""
+
+    weight_width = init.shape[1] * w_dtype.bitwidth()
+    weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+    hex_init = pack_innermost_dim_as_hex_string(
+        init, w_dtype, weight_width_padded, prefix="0x"
+    )
+    ext_weight = np.array([], dtype=np.uint8)
+    for line in hex_init:
+        array_line = [
+            x for x in reversed(hexstring2npbytearray(line, remove_prefix="0x"))
+        ]
+        ext_weight = np.append(ext_weight, array_line)
+
+    return ext_weight
 
 
 class MakePYNQDriver(Transformation):
     """Create PYNQ Python code to correctly interface the generated accelerator,
     including data packing/unpacking. Should be called
-    after conversion to HLS layers and folding, but prior to the creation of
+    after conversion to HLS layers, folding, and the creation of
     dataflow partitions for correct operation.
 
     platform: one of ["zynq-iodma", "alveo"]
@@ -123,6 +149,40 @@ class MakePYNQDriver(Transformation):
         i_tensor_shape_packed = i_tensor_dummy_packed.shape
         o_tensor_shape_packed = o_tensor_dummy_packed.shape
 
+        # generate external weights npy files
+        weights_dir = pynq_driver_dir + "/runtime_weights"
+
+        os.makedirs(weights_dir)
+        idma_idx = 0
+        ext_weight_dma_cnt = 0
+
+        for node in model.graph.node:
+            assert (
+                node.op_type == "StreamingDataflowPartition"
+            ), "CreateDataflowPartition needs to be applied before driver generation"
+
+            producer = model.find_producer(node.input[0])
+            init_tensor = model.get_initializer(node.input[0])
+
+            if producer is None:  # input DMA?
+                idma_name = "idma" + str(idma_idx)
+                if init_tensor is not None:  # input weights DMA?
+                    ext_weight_dma_cnt += 1
+                    w_dtype = model.get_tensor_datatype(node.input[0])
+                    init_external_tensor = to_external_tensor(init_tensor, w_dtype)
+                    np.save(
+                        weights_dir + "/" + idma_name + ".npy", init_external_tensor
+                    )
+                    if self.platform != "alveo":
+                        # TODO: add support in driver_base.py
+                        warnings.warn(
+                            "external_weights not yet supported for Zynq builds"
+                        )
+                else:
+                    net_input_name = idma_name
+
+                idma_idx += 1
+
         # fill in the driver template
         driver_py = pynq_driver_dir + "/driver.py"
         driver = template_driver.pynq_driver_template
@@ -146,6 +206,8 @@ class MakePYNQDriver(Transformation):
         driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
         driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
+        driver = driver.replace("$INPUT_DMA_NAME$", "'%s'" % net_input_name)
+        driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt))
 
         with open(driver_py, "w") as f:
             f.write(driver)
@@ -172,25 +234,35 @@ class MakePYNQDriver(Transformation):
         shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core")
 
         # generate weight files for runtime-writable layers
-        weights_dir = pynq_driver_dir + "/runtime_weights"
-        rt_layer_ind = 0
-        os.makedirs(weights_dir)
-        for node in model.graph.node:
-            if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
-                node_inst = getCustomOp(node)
-                is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
-                if is_rt_weights == 1:
-                    fcl_w = model.get_initializer(node.input[1])
-                    w_filename = weights_dir + "/%d_%s.dat" % (rt_layer_ind, node.name)
-                    node_inst.make_weight_file(fcl_w, "decoupled_runtime", w_filename)
-                    rt_layer_ind += 1
-            elif node.op_type == "StreamingDataflowPartition":
-                warnings.warn(
-                    """Please call MakePYNQDriver prior to
-                CreateDataflowPartition. Can only extract runtime-writable
-                weights from HLSCustomOp instances and not StreamingDataflowPartition.
-                """
-                )
-            else:
-                continue
+
+        for sdp_ind, sdp_node in enumerate(model.graph.node):
+            assert sdp_node.op_type == "StreamingDataflowPartition"
+            # get the dataflow model underneath this partition
+            sdp_node = getCustomOp(sdp_node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            dataflow_model = ModelWrapper(dataflow_model_filename)
+            rt_layer_ind = 0
+            for node in dataflow_model.graph.node:
+                if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
+                    node_inst = getCustomOp(node)
+                    is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
+                    if is_rt_weights == 1:
+                        fcl_w = dataflow_model.get_initializer(node.input[1])
+                        w_filename = weights_dir + "/%d_%d_%s.dat" % (
+                            sdp_ind,
+                            rt_layer_ind,
+                            node.name,
+                        )
+                        node_inst.make_weight_file(
+                            fcl_w, "decoupled_runtime", w_filename
+                        )
+                        rt_layer_ind += 1
+                elif node.op_type == "StreamingDataflowPartition":
+                    warnings.warn(
+                        """Nested StreamingDataflowPartitions are not supported"""
+                    )
+                else:
+                    continue
+
         return (model, False)
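Editor's note: a quick sanity check of the byte-level layout to_external_tensor produces (shape and dtype hypothetical; the exact bit order comes from FINN's data packing utilities):

    # a (64, 4) INT2 weight tensor: 4 * 2 = 8 bits per row, padded up to a
    # multiple of 4 bits for the hex string -> one byte per row, 64 bytes total
    init_shape = (64, 4)
    bits_per_row = init_shape[1] * 2
    padded_bits = -(-bits_per_row // 4) * 4   # roundup_to_integer_multiple(., 4)
    assert padded_bits // 8 == 1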
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 1ac7ee178531e745bf68405d1ae9df35c0c216fb..3dab426ccf9bab73ddac83299bdc47f89ea46bdc 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -286,7 +286,10 @@ class MakeZYNQProject(Transformation):
 
 
 class ZynqBuild(Transformation):
-    """Best-effort attempt at building the accelerator for Zynq."""
+    """Best-effort attempt at building the accelerator for Zynq.
+    It assumes the model has only fpgadataflow nodes.
+
+    """
 
     def __init__(self, platform, period_ns, enable_debug=False):
         super().__init__()
@@ -300,7 +303,6 @@ class ZynqBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            MakePYNQDriver(platform="zynq-iodma"),
             InsertIODMA(64),
             InsertDWC(),
             Floorplan(),
@@ -335,6 +337,10 @@ class ZynqBuild(Transformation):
         model = model.transform(
             MakeZYNQProject(self.platform, enable_debug=self.enable_debug)
         )
+
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "zynq-iodma")
+
+        # create driver
+        model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py
index b595205714d8cb630816d2b42fe96640e49e506e..5265835dd2530a5c93ceefbef629a43d6f33de52 100644
--- a/src/finn/transformation/fpgadataflow/template_driver.py
+++ b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -77,7 +77,9 @@
     "ishape_folded" : $INPUT_SHAPE_FOLDED$,
     "oshape_folded" : $OUTPUT_SHAPE_FOLDED$,
     "ishape_packed" : $INPUT_SHAPE_PACKED$,
-    "oshape_packed" : $OUTPUT_SHAPE_PACKED$
+    "oshape_packed" : $OUTPUT_SHAPE_PACKED$,
+    "input_dma_name" : $INPUT_DMA_NAME$,
+    "number_of_external_weights": $EXT_WEIGHT_NUM$
 }
 
 if __name__ == "__main__":
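Editor's note: after template substitution, the generated driver's io_shape_dict gains the two new keys. An illustrative (hypothetical) result for a network with one external-weight layer, where idma0 carries the weights and idma1 the activations:

    io_shape_dict = {
        # ... shape entries as before ...
        "input_dma_name": "idma1",            # from $INPUT_DMA_NAME$
        "number_of_external_weights": 1,      # from $EXT_WEIGHT_NUM$
    }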
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index e52fb14b158a7927311d1b7e90067fea4bde6e27..0fe4276096852c08d0798be8e1ee715cc5769286 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -28,6 +28,7 @@
 
 import os
 import subprocess
+import json
 
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.base import Transformation
@@ -38,14 +39,17 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 )
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
-from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.floorplan import Floorplan
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    RemoveUnusedTensors,
+)
 from finn.util.basic import make_build_dir
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from . import templates
@@ -89,63 +93,47 @@ class CreateVitisXO(Transformation):
         _check_vitis_envvars()
         vivado_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
         stitched_ip_dir = vivado_proj_dir + "/ip"
+        interfaces = json.loads(model.get_metadata_prop("vivado_stitch_ifnames"))
         args_string = []
-        m_axis_idx = 0
-        s_axis_idx = 0
+        arg_id = 0
         # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface
         # developed from instructions in UG1393 (v2019.2) and package_xo documentation
         # package_xo is responsible for generating the kernel xml
-        ifnames = eval(model.get_metadata_prop("vivado_stitch_ifnames"))
         assert (
-            len(ifnames["axilite"]) <= 1
+            len(interfaces["axilite"]) <= 1
         ), "CreateVitisXO supports max 1 AXI lite interface"
-        if len(ifnames["axilite"]) == 1:
-            axilite_intf_name = ifnames["axilite"][0]
-        else:
-            axilite_intf_name = None
-
-        for node in model.graph.node:
-            node_inst = getCustomOp(node)
-            arg_id = 0
-            if node.op_type == "TLastMarker":
-                stream_width = node_inst.get_nodeattr("StreamWidth")
-                # add a stream input or output port, based on direction
-                if node_inst.get_nodeattr("Direction") == "in":
-                    args_string.append(
-                        "{in:4:%s:s_axis_%d:0x0:0x0:ap_uint<%s>:0}"
-                        % (str(arg_id), s_axis_idx, str(stream_width))
-                    )
-                    s_axis_idx += 1
-                else:
-                    args_string.append(
-                        "{out:4:%s:m_axis_%d:0x0:0x0:ap_uint<%s>:0}"
-                        % (str(arg_id), m_axis_idx, str(stream_width))
+        axilite_intf_name = None
+        if len(interfaces["axilite"]) == 1:
+            axilite_intf_name = interfaces["axilite"][0]
+            if len(interfaces["aximm"]) > 0:
+                args_string.append(
+                    "{addr:1:%s:%s:0x8:0x10:ap_uint<%s>*:0}"
+                    % (
+                        str(arg_id),
+                        interfaces["aximm"][0][0],
+                        str(interfaces["aximm"][0][1]),
                     )
-                    m_axis_idx += 1
+                )
                 arg_id += 1
-                # add a axilite port if dynamic
-                # add a count parameter if dynamic
-                if node_inst.get_nodeattr("DynIters") == 1:
-                    assert axilite_intf_name is not None
-                    args_string.append(
-                        "{numReps:0:%s:%s:0x4:0x10:uint:0}"
-                        % (str(arg_id), axilite_intf_name)
-                    )
-                    arg_id += 1
-            elif node.op_type == "IODMA":
-                port_width = node_inst.get_nodeattr("intfWidth")
-                # add an address parameter
-                # add a count parameter
                 args_string.append(
-                    "{addr:1:%s:m_axi_gmem0:0x8:0x10:ap_uint<%s>*:0}"
-                    % (str(arg_id), str(port_width))
+                    "{numReps:0:%s:%s:0x4:0x1C:uint:0}"
+                    % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
+            else:
                 args_string.append(
-                    "{numReps:0:%s:%s:0x4:0x1C:uint:0}"
+                    "{numReps:0:%s:%s:0x4:0x10:uint:0}"
                     % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
+        for intf in interfaces["s_axis"] + interfaces["m_axis"]:
+            stream_width = intf[1]
+            stream_name = intf[0]
+            args_string.append(
+                "{%s:4:%s:%s:0x0:0x0:ap_uint<%s>:0}"
+                % (stream_name, str(arg_id), stream_name, str(stream_width))
+            )
+            arg_id += 1
 
         # save kernel xml then run package_xo
         xo_name = self.ip_name + ".xo"
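Editor's note: kernel arguments are now derived from the stitched-IP interface metadata rather than by walking TLastMarker/IODMA nodes. Illustrative args_string entries produced by the format strings above (interface names and widths hypothetical):

    args = [
        "{addr:1:0:m_axi_gmem0:0x8:0x10:ap_uint<64>*:0}",      # AXI-MM pointer
        "{numReps:0:1:s_axi_control:0x4:0x1C:uint:0}",         # AXI-lite count
        "{s_axis_0:4:2:s_axis_0:0x0:0x0:ap_uint<64>:0}",       # one per stream
    ]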
@@ -342,6 +330,7 @@ class VitisLink(Transformation):
 
 class VitisBuild(Transformation):
     """Best-effort attempt at building the accelerator with Vitis.
+    It assumes the model has only fpgadataflow nodes.
 
     fpga_part: string identifying the target FPGA
     period_ns: target clock period
@@ -377,7 +366,6 @@ class VitisBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            MakePYNQDriver(platform="alveo"),
             InsertIODMA(512),
             InsertDWC(),
         ]
@@ -399,9 +387,7 @@ class VitisBuild(Transformation):
             dataflow_model_filename = sdp_node.get_nodeattr("model")
             kernel_model = ModelWrapper(dataflow_model_filename)
             kernel_model = kernel_model.transform(InsertFIFO())
-            kernel_model = kernel_model.transform(
-                InsertTLastMarker(both=True, external=False, dynamic=False)
-            )
+            kernel_model = kernel_model.transform(RemoveUnusedTensors())
             kernel_model = kernel_model.transform(GiveUniqueNodeNames())
             kernel_model.save(dataflow_model_filename)
             kernel_model = kernel_model.transform(
@@ -430,4 +416,6 @@ class VitisBuild(Transformation):
 
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "alveo")
+        # create driver
+        model = model.transform(MakePYNQDriver(platform="alveo"))
         return (model, False)
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index ddea2dafce02c181a279d9c95759b97dee00a504..2823dec1fbce9f2e6a5f5f681cf403c205ee0a2d 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -591,6 +591,45 @@ class TestEnd2End:
             update_dashboard_data(topology, wbits, abits, "board", cfg["board"])
         model.save(get_checkpoint_name(topology, wbits, abits, "build_" + kind))
 
+    @pytest.mark.slow
+    @pytest.mark.vivado
+    @pytest.mark.vitis
+    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
+    def test_build_extweights(self, topology, wbits, abits, kind):
+        if "VITIS_PATH" not in os.environ:
+            pytest.skip("VITIS_PATH not set")
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, "fifodepth_" + kind
+        )
+        model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        # select some FC layers, erase their implementation
+        # and set them to external weights
+        num_extw_layers = 0
+        for node in model.graph.node:
+            if node.op_type == "StreamingFCLayer_Batch":
+                node_inst = getCustomOp(node)
+                simd = node_inst.get_nodeattr("SIMD")
+                pe = node_inst.get_nodeattr("PE")
+                # skip layers which require very large IODMA DWCs
+                if (512 % simd) != 0 or ((pe * simd) % 32) != 0:
+                    continue
+                node_inst.set_nodeattr("code_gen_dir_ipgen", "")
+                node_inst.set_nodeattr("ipgen_path", "")
+                node_inst.set_nodeattr("mem_mode", "external")
+                num_extw_layers += 1
+        if num_extw_layers == 0:
+            pytest.skip("No layers suitable for external weights")
+        # build
+        cfg = get_build_env(kind, target_clk_ns)
+        model = model.transform(cfg["build_fxn"])
+        # check list of interfaces
+        # model = model.transform(AnnotateResources("synth"))
+        # synth_dct = eval(model.get_metadata_prop("res_total_top_synth"))
+        # for (k, v) in synth_dct.items():
+        #     update_dashboard_data(topology, wbits, abits, k, v)
+        # update_dashboard_data(topology, wbits, abits, "board", cfg["board"])
+        model.save(
+            get_checkpoint_name(topology, wbits, abits, "build_" + kind + "_extweights")
+        )
+
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
     def test_deploy(self, topology, wbits, abits, kind):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + kind)
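Editor's note: as the commented-out checks hint, one way to verify such a build is to reload a stitched checkpoint and inspect the interface metadata recorded by CreateStitchedIP (path hypothetical; for Vitis builds the property lives on the per-kernel models rather than the top-level model):

    import json
    from finn.core.modelwrapper import ModelWrapper

    model = ModelWrapper("build_alveo_extweights.onnx")  # hypothetical checkpoint
    ifnames = json.loads(model.get_metadata_prop("vivado_stitch_ifnames"))
    # expect one extra s_axis entry per external-weight layer
    print(ifnames["s_axis"])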