diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index b206e00a2eb6da1d76ccf57c078b16f61868a98c..bd938f17411ee42e94e95e02776ad8e973ea10fa 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -114,9 +114,9 @@ default_build_dataflow_steps = [ "step_set_fifo_depths", "step_create_stitched_ip", "step_measure_rtlsim_performance", - "step_make_pynq_driver", "step_out_of_context_synthesis", "step_synthesize_bitfile", + "step_make_pynq_driver", "step_deployment_package", ] diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py index 05e41a48a8f4cb34616bf06c01b652afb9ae4257..38940ccb94f11fe49af5f49ee020f150326a026c 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py @@ -63,7 +63,7 @@ class AddStreams_Batch(HLSCustomOp): ishape = tuple(vecs + [ich]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") assert ich % pe == 0, "PE must divide NumChannels" @@ -362,5 +362,5 @@ class AddStreams_Batch(HLSCustomOp): def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() - intf_names["s_axis"] = ["in0_V_V", "in1_V_V"] + intf_names["s_axis"].append(("in1_V_V", self.get_instream_width_padded())) return intf_names diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py index 10a8051730217b56873b5a53c0803e3b90dada90..73da77bd3f940cee5ffd10fcfc43571f1a612eb4 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py @@ -312,7 +312,8 @@ class DuplicateStreams_Batch(HLSCustomOp): def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ 
"""DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format( - self.get_outstream_width(), self.get_number_output_values() // 2, + self.get_outstream_width(), + self.get_number_output_values() // 2, ) ] @@ -378,5 +379,8 @@ class DuplicateStreams_Batch(HLSCustomOp): def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() - intf_names["m_axis"] = ["out0_V_V", "out1_V_V"] + intf_names["m_axis"] = [ + ("out0_V_V", self.get_outstream_width_padded()), + ("out1_V_V", self.get_outstream_width_padded()), + ] return intf_names diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 02912b2d5f45b3bab0eaca13ee0a0bf19bf9cfca..2ab070b2fdc059a554930345a81abc368c29bfa7 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -123,15 +123,16 @@ class HLSCustomOp(CustomOp): """Return a dict of names of input and output interfaces. The keys reflect the protocols each interface implements: 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. - Values are lists of names: - 's_axis' names correspond to the list of node inputs in order, - 'm_axis' names correspond to the list of node outputs in order' + Values are lists of tuples (axis, aximm) or names (axilite): + 'axis' tuples correspond to the list of node inputs in order, + each tuple is (interface_name, interface_width_bits). + axilite always assumed to be 32 bits and is not tuple (name only). 
Each block must have at most one aximm and one axilite.""" intf_names = {} intf_names["clk"] = ["ap_clk"] intf_names["rst"] = ["ap_rst_n"] - intf_names["s_axis"] = ["in0_V_V"] - intf_names["m_axis"] = ["out_V_V"] + intf_names["s_axis"] = [("in0_V_V", self.get_instream_width_padded())] + intf_names["m_axis"] = [("out_V_V", self.get_outstream_width_padded())] intf_names["aximm"] = [] intf_names["axilite"] = [] return intf_names diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py index a6cddcc4aeb45957c16249cd57f122fe5e58b85a..857496a2614894588ebf065db3e384cf2cecf106 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -355,11 +355,9 @@ class IODMA(HLSCustomOp): def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() if self.get_nodeattr("direction") == "out": - intf_names["s_axis"] = ["in0_V_V"] intf_names["m_axis"] = [] else: intf_names["s_axis"] = [] - intf_names["m_axis"] = ["out_V_V"] intf_names["axilite"] = ["s_axi_control"] - intf_names["aximm"] = ["m_axi_gmem"] + intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("intfWidth"))] return intf_names diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py index 23c1779a27c123583c0c8af5f53d022d03e78126..4d84b74dce001fca769ed2850a8f718ac942f14c 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py @@ -395,8 +395,8 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0] - din_name = 
self.get_verilog_top_module_intf_names()["s_axis"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] cmd.append("create_bd_cell -type hier %s" % node_name) cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 8868002c9e2cb8726eeb573e104140e3e1a61d27..3cc01ade73fc6b735509f2839e5c10785a8b9f54 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -444,12 +444,24 @@ class StreamingFCLayer_Batch(HLSCustomOp): single_pe_w = simd * weight_bits return max([weightstream, max_of_io, single_pe_w]) - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") sf = mw // simd + nf = mh // pe vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [sf, simd]) + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple(vecs + [sf, simd]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple(vecs + [sf * nf, simd * pe]) + else: + raise Exception("Undefined input shape for requested input") + return folded_input_shape def get_folded_output_shape(self): @@ -1253,8 +1265,8 @@ class StreamingFCLayer_Batch(HLSCustomOp): # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0] - din_name = 
self.get_verilog_top_module_intf_names()["s_axis"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] cmd.append("create_bd_cell -type hier %s" % node_name) cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) @@ -1348,8 +1360,8 @@ class StreamingFCLayer_Batch(HLSCustomOp): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const": - # base class impl sufficient for const mode + elif mem_mode == "const" or mem_mode == "external": + # base class impl sufficient for const/external modes return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for StreamingFCLayer") @@ -1359,7 +1371,9 @@ class StreamingFCLayer_Batch(HLSCustomOp): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "external": - intf_names["s_axis"] = ["in0_V_V", "weights_V_V"] + intf_names["s_axis"].append( + ("weights_V_V", self.get_weightstream_width_padded()) + ) if mem_mode == "decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index fb41bceca09fe544bd729537b1af726c9c43d290..133a869b28cf9968a719e243a3266dfb25b637ba 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -343,8 +343,8 @@ class StreamingFIFO(HLSCustomOp): # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0] - din_name = 
self.get_verilog_top_module_intf_names()["s_axis"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] cmd.append("create_bd_cell -type hier %s" % node_name) cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index 30374a7d97f4d2189e142a9b7b6e44a5abbb46b0..0b248c15035a2b685ebfb024c8a944a6ea6c65bf 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -825,8 +825,8 @@ class Thresholding_Batch(HLSCustomOp): # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] cmd.append("create_bd_cell -type hier %s" % node_name) cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index 9ec03ea5dd726b49b157a92addef05f85f02b644..f430402538b873c3db7c93ceca79d324d878571d 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -86,25 +86,78 @@ class FINNExampleOverlay(Overlay): self.batch_size = batch_size self.fclk_mhz = fclk_mhz if self.platform == "alveo": - self.idma = self.idma0 + if "input_dma_name" in 
io_shape_dict.keys(): + self.idma = getattr(self, io_shape_dict["input_dma_name"]) + else: + self.idma = self.idma0 self.odma = self.odma0 self.odma_handle = None elif self.platform == "zynq-iodma": - self.idma = self.idma0 + if "input_dma_name" in io_shape_dict.keys(): + self.idma = getattr(self, io_shape_dict["input_dma_name"]) + else: + self.idma = self.idma0 self.odma = self.odma0 # set the clock frequency as specified by user during transformations if self.fclk_mhz > 0: Clocks.fclk0_mhz = self.fclk_mhz else: raise ValueError("Supported platforms are zynq-iodma alveo") - # load any runtime weights + # load any external + runtime weights + self.load_external_weights() self.load_runtime_weights() + def load_external_weights(self): + """Load any existing external (DRAM) weights from the specified dir into the + appropriate layer of the accelerator. Note that this must be enabled + during the accelerator build process. The weights directory + is specified as the class member ``runtime_weight_dir``. External (DRAM) + weights are one .npy file per layer. 
+ """ + + self.external_weights = [] + w_filenames = [] + if not os.path.isdir(self.runtime_weight_dir): + return + for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir): + w_filenames.extend(filenames) + + tmp_weight_dict = {} + + for w_filename in w_filenames: + if w_filename.endswith(".npy"): + weight_tensor = np.load(self.runtime_weight_dir + "/" + w_filename) + else: + continue + + idma_name = w_filename.split(".")[0] + tmp_weight_dict[idma_name] = weight_tensor + + for idma_name in tmp_weight_dict.keys(): + if idma_name in self.ip_dict.keys(): + iwdma = getattr(self, idma_name) + weight_tensor = tmp_weight_dict[idma_name] + weight_buf = allocate(weight_tensor.shape, dtype=np.uint8) + weight_buf[:] = weight_tensor + # weight_buf.sync_to_device() + weight_buf.flush() + + self.external_weights += [(iwdma, weight_buf, idma_name)] + + if "number_of_external_weights" in self._io_shape_dict: + hw_ext_weights = self._io_shape_dict["number_of_external_weights"] + assert len(self.external_weights) == hw_ext_weights, ( + "Number of hardware external weights and number of external " + + "weight tensors available do not match. \n" + + "Is runtime_weight_dir pointing to the correct folder?" + ) + def load_runtime_weights(self, flush_accel=True, verify=True): - """Load any existing runtime weights from the specified dir into the + """Load any existing runtime-writable weights from the specified dir into the appropriate layer of the accelerator. Note that this must be enabled during the accelerator build process. The runtime weights directory - is specified as the class member ``runtime_weight_dir``. + is specified as the class member ``runtime_weight_dir``. Runtime-writable + weights are provided as one .dat file per layer. 
Parameters ---------- @@ -124,18 +177,25 @@ class FINNExampleOverlay(Overlay): if w_filename.endswith(".dat"): with open(self.runtime_weight_dir + "/" + w_filename, "r") as f: dat = f.read() + else: + continue layer_w = np.fromiter( [int(x, 16) for x in dat.strip().split()], dtype=np.uint32 ) - layer_ind = int(w_filename.split("_")[0]) - rt_weight_dict[layer_ind] = layer_w - for layer_ind in rt_weight_dict.keys(): - cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind + sdp_ind = int(w_filename.split("_")[0]) + layer_ind = int(w_filename.split("_")[1]) + rt_weight_dict[(sdp_ind, layer_ind)] = layer_w + for sdp_ind, layer_ind in rt_weight_dict.keys(): + cand_if_name = "StreamingDataflowPartition_%d/s_axilite_%d" % ( + sdp_ind, + layer_ind, + ) if cand_if_name in self.ip_dict.keys(): layer_mmio = getattr( - self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind + getattr(self, "StreamingDataflowPartition_%d" % sdp_ind), + "s_axilite_%d" % layer_ind, ).mmio - layer_w = rt_weight_dict[layer_ind] + layer_w = rt_weight_dict[(sdp_ind, layer_ind)] layer_mmio.write_mm(0, layer_w.tobytes()) if verify: new_w = np.copy(layer_mmio.array[: layer_w.shape[0]]) @@ -280,6 +340,10 @@ class FINNExampleOverlay(Overlay): if self.platform == "zynq-iodma": assert self.odma.read(0x00) & 0x4 != 0, "Output DMA is not idle" # manually launch IODMAs since signatures are missing + for iwdma, iwbuf, iwdma_name in self.external_weights: + iwdma.write(0x10, iwbuf.device_address) + iwdma.write(0x1C, batch_size) + iwdma.write(0x00, 1) self.idma.write(0x10, self.ibuf_packed_device.device_address) self.idma.write(0x1C, batch_size) self.odma.write(0x10, self.obuf_packed_device.device_address) @@ -289,6 +353,8 @@ class FINNExampleOverlay(Overlay): elif self.platform == "alveo": assert self.odma_handle is None, "Output DMA is already running" self.idma.start(self.ibuf_packed_device, batch_size) + for iwdma, iwbuf, iwdma_name in self.external_weights: + iwdma.start(iwbuf, 
batch_size) self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size) else: raise Exception("Unrecognized platform: %s" % self.platform) @@ -340,6 +406,10 @@ class FINNExampleOverlay(Overlay): res["DRAM_out_bandwidth[Mb/s]"] = ( np.prod(self.oshape_packed) * 0.000001 / runtime ) + for iwdma, iwbuf, iwdma_name in self.external_weights: + res["DRAM_extw_%s_bandwidth[Mb/s]" % iwdma_name] = ( + self.batch_size * np.prod(iwbuf.shape) * 0.000001 / runtime + ) if self.platform == "zynq-iodma": res["fclk[mhz]"] = Clocks.fclk0_mhz elif self.platform == "alveo": diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json new file mode 100644 index 0000000000000000000000000000000000000000..299a8be815aeaba70c0f41e4b1b3252b77c6f042 --- /dev/null +++ b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json @@ -0,0 +1,30 @@ +{ + "Defaults": {}, + "Thresholding_Batch_0": { + "PE": 49, + "ram_style": "distributed" + }, + "StreamingFCLayer_Batch_0": { + "PE": 16, + "SIMD": 49, + "ram_style": "block" + }, + "StreamingFCLayer_Batch_1": { + "PE": 8, + "SIMD": 8, + "mem_mode": "external" + }, + "StreamingFCLayer_Batch_2": { + "PE": 8, + "SIMD": 8, + "mem_mode": "external" + }, + "StreamingFCLayer_Batch_3": { + "PE": 10, + "SIMD": 8, + "ram_style": "distributed" + }, + "LabelSelect_Batch_0": { + "PE": 1 + } + } diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 19fa5c603bfafe16ed151e10fa8eb11a79106ede..738f2000a1929024d3808dd7bad0267338b51659 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -29,17 +29,43 @@ import os import warnings import subprocess +import json from finn.transformation.base import Transformation -from finn.util.basic import get_by_name, make_build_dir, is_finn_op +from finn.util.basic import make_build_dir, 
get_num_default_workers +from finn.util.fpgadataflow import is_fpgadataflow_node from finn.custom_op.registry import getCustomOp -from finn.util.basic import get_num_default_workers import multiprocessing as mp from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) +def is_external_input(model, node, i): + # indicate whether input i of node should be made external + # True only if input is unconnected and has no initializer + # Only exception is second input of FC layers when mem_mode is external + node_inst = getCustomOp(node) + producer = model.find_producer(node.input[i]) + if producer is None: + if model.get_initializer(node.input[i]) is None: + return True + else: + if node.op_type == "StreamingFCLayer_Batch": + if node_inst.get_nodeattr("mem_mode") == "external": + return True + return False + + +def is_external_output(model, node, i): + # indicate whether output i of node should be made external + # True only if output is unconnected + consumers = model.find_consumers(node.output[i]) + if consumers is None: + return True + return False + + class CreateStitchedIP(Transformation): """Create a Vivado IP Block Design project from all the generated IPs of a graph. 
All nodes in the graph must have the fpgadataflow backend attribute, @@ -134,21 +160,24 @@ class CreateStitchedIP(Transformation): if len(aximm_intf_name) != 0: self.connect_cmds.append( "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" - % (inst_name, aximm_intf_name[0]) + % (inst_name, aximm_intf_name[0][0]) ) self.connect_cmds.append( "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]" ) - self.intf_names["aximm"] = ["m_axi_gmem0"] + self.intf_names["aximm"] = [("m_axi_gmem0", aximm_intf_name[0][1])] assert self.has_aximm is False, "Currently limited to one AXI-MM interface" self.has_aximm = True - def connect_m_axis_external(self, node): + def connect_m_axis_external(self, node, idx=None): inst_name = node.name node_inst = getCustomOp(node) output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"] # make output axis external - for output_intf_name in output_intf_names: + for i in range(len(output_intf_names)): + if idx is not None and idx != i: + continue + output_intf_name = output_intf_names[i][0] self.connect_cmds.append( "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" % (inst_name, output_intf_name) @@ -158,15 +187,20 @@ class CreateStitchedIP(Transformation): % (self.m_axis_idx, output_intf_name) ) self.has_m_axis = True - self.intf_names["m_axis"].append("m_axis_%d" % self.m_axis_idx) + self.intf_names["m_axis"].append( + ("m_axis_%d" % self.m_axis_idx, output_intf_names[i][1]) + ) self.m_axis_idx += 1 - def connect_s_axis_external(self, node): + def connect_s_axis_external(self, node, idx=None): inst_name = node.name node_inst = getCustomOp(node) input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"] # make input axis external - for input_intf_name in input_intf_names: + for i in range(len(input_intf_names)): + if idx is not None and idx != i: + continue + input_intf_name = input_intf_names[i][0] self.connect_cmds.append( "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" % (inst_name, 
input_intf_name) @@ -176,7 +210,9 @@ class CreateStitchedIP(Transformation): % (self.s_axis_idx, input_intf_name) ) self.has_s_axis = True - self.intf_names["s_axis"].append("s_axis_%d" % self.s_axis_idx) + self.intf_names["s_axis"].append( + ("s_axis_%d" % self.s_axis_idx, input_intf_names[i][1]) + ) self.s_axis_idx += 1 def apply(self, model): @@ -187,70 +223,38 @@ class CreateStitchedIP(Transformation): ip_dirs.append("/workspace/finn/finn-rtllib/memstream") # ensure that all nodes are fpgadataflow, and that IPs are generated for node in model.graph.node: - assert is_finn_op(node.domain), "Found non-FINN node" - backend_attribute = get_by_name(node.attribute, "backend") - assert backend_attribute is not None, "Backend node attribute is not set." - backend_value = backend_attribute.s.decode("UTF-8") - assert ( - backend_value == "fpgadataflow" - ), """Backend node attribute is not - set to "fpgadataflow".""" + assert is_fpgadataflow_node( + node + ), "All nodes must be FINN fpgadataflow nodes." node_inst = getCustomOp(node) ip_dir_value = node_inst.get_nodeattr("ip_path") assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist." 
ip_dirs += [ip_dir_value] self.create_cmds += node_inst.code_generation_ipi() - my_producer = model.find_producer(node.input[0]) self.connect_clk_rst(node) self.connect_axi(node) - if my_producer is None: - # first node in graph - self.connect_s_axis_external(node) - if node.op_type == "TLastMarker": - assert ( - node_inst.get_nodeattr("Direction") == "in" - ), """Output TLastMarker incorrect direction""" - elif node.op_type == "IODMA" and len(model.graph.node) != 1: - # don't apply this check for a 1-node partition - assert ( - node_inst.get_nodeattr("direction") == "in" - ), """Input DMA incorrect direction""" - else: - # intermediate node - # wire up input(s) to previous node output(s) - # foreach input - # find producer - # find index of producer output connected to our target input - # get names of hdl interfaces for input and producer output - # issue a TCL directive to connect input to output - # if FC layer with mode "decoupled", add a streamer on input 1 - for i in range(len(node.input)): + for i in range(len(node.input)): + if is_external_input(model, node, i): + self.connect_s_axis_external(node, idx=i) + else: producer = model.find_producer(node.input[i]) if producer is None: continue j = list(producer.output).index(node.input[i]) src_intf_name = getCustomOp( producer - ).get_verilog_top_module_intf_names()["m_axis"][j] + ).get_verilog_top_module_intf_names()["m_axis"][j][0] dst_intf_name = node_inst.get_verilog_top_module_intf_names()[ "s_axis" - ][i] + ][i][0] self.connect_cmds.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " "[get_bd_intf_pins %s/%s]" % (producer.name, src_intf_name, node.name, dst_intf_name) ) - if model.find_consumers(node.output[0]) is None: - # last node in graph - self.connect_m_axis_external(node) - if node.op_type == "TLastMarker": - assert ( - node_inst.get_nodeattr("Direction") == "out" - ), """Output TLastMarker incorrect direction""" - elif node.op_type == "IODMA" and len(model.graph.node) != 1: - assert ( - 
node_inst.get_nodeattr("direction") == "out" - ), """Output DMA incorrect direction""" + for i in range(len(node.output)): + if is_external_output(model, node, i): + self.connect_m_axis_external(node, idx=i) # create a temporary folder for the project prjname = "finn_vivado_stitch_proj" @@ -316,7 +320,7 @@ class CreateStitchedIP(Transformation): block_library = "finn" block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name) model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv) - model.set_metadata_prop("vivado_stitch_ifnames", str(self.intf_names)) + model.set_metadata_prop("vivado_stitch_ifnames", json.dumps(self.intf_names)) tcl.append( ( "ipx::package_project -root_dir %s/ip -vendor %s " diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 0f2b8ef6a4c0858cd98218538930c97c6df2ad9d..c8df80659d30e1855fc658bad83c3fe9bccb9bf9 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -44,8 +44,8 @@ class InsertDWC(Transformation): for n in graph.node: node_ind += 1 if _suitable_node(n): - for n_output in n.output: - consumers = model.find_consumers(n_output) + for output_name in n.output: + consumers = model.find_consumers(output_name) if consumers is None: continue if len(consumers) > 1: @@ -59,7 +59,22 @@ class InsertDWC(Transformation): n0 = getCustomOp(n) n1 = getCustomOp(consumer) n0_out_shape = n0.get_folded_output_shape() - n1_in_shape = n1.get_folded_input_shape() + + # If FC and external mem, it could be connected to input 1 + if ( + consumer.op_type == "StreamingFCLayer_Batch" + and n1.get_nodeattr("mem_mode") == "external" + ): + # get input idx + in_idx = None + for idx, n_input in enumerate(consumer.input): + if output_name == n_input: + in_idx = idx + assert in_idx is not None, "Malformed model" + n1_in_shape = n1.get_folded_input_shape(in_idx) + else: + n1_in_shape = n1.get_folded_input_shape() + if 
n0_out_shape[-1] != n1_in_shape[-1]: graph_modified = True # determine dwc inwidth @@ -82,7 +97,7 @@ class InsertDWC(Transformation): dwc_node = oh.make_node( "StreamingDataWidthConverter_Batch", - [n_output], + [output_name], [dwc_output_tensor.name], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", @@ -96,7 +111,7 @@ class InsertDWC(Transformation): # set dwc output tensor as new input tensor of second node for idx, inp in enumerate(consumer.input): - if inp == n_output: + if inp == output_name: consumer.input[idx] = dwc_output_tensor.name return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 67143547557a9b24b311e69cff6f885f8745cd3c..27055a4fd29dba3849c0e4a889f27802f8c36081 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -33,7 +33,6 @@ from finn.util.basic import get_by_name from finn.custom_op.registry import getCustomOp from finn.transformation.base import Transformation from finn.transformation.general import SortGraph -import finn.core.data_layout as DataLayout import math import numpy as np @@ -48,6 +47,45 @@ class InsertIODMA(Transformation): ), "max_intfwidth must be a power of 2" self.max_intfwidth = max_intfwidth + def get_mem_init(self, weights, pe, simd): + """ + Returns matrix ready for pack_innermost_dim_as_hex_string with + reverse=False (finn.util.data_packing) to return the memory init file + little endian packed. + That is, get_mem_init returns: + elem(pe,simd) + addr = 0: [(pe-1,simd-1),(pe-1,simd-2),...(0,1),(0,0)] + addr = 1: [(pe-1,simd*2-1),.......(0,simd+1),(0,simd)] + . 
+ """ + + # TODO: refactor this into streamingfclayer_batch.py, could go into + # make_weight_file except it doesn't write a file but returns a npy + # array instead + w_shape = weights.shape + assert len(w_shape) == 2, "weights withincorrect number of dims" + inp_w, out_w = w_shape + + assert out_w % pe == 0, "Malformed weight matrix" + assert inp_w % simd == 0, "Malformed weight matrix" + reshaped_w = np.zeros(inp_w * out_w).reshape(-1, pe * simd) + + addr = 0 + for fr in range(out_w // pe): + for fc in range(inp_w // simd): + w0_lower = fc * simd + w0_upper = (fc + 1) * simd + w1_lower = fr * pe + w1_upper = (fr + 1) * pe + tile = weights[w0_lower:w0_upper, w1_lower:w1_upper] + for p in range(pe): + rw0_lower = p * simd + rw0_upper = (p + 1) * simd + reshaped_w[addr, rw0_lower:rw0_upper] = tile[:, p].transpose() + addr += 1 + reshaped_w = np.flip(reshaped_w, axis=-1) + return reshaped_w + def apply(self, model): # only makes sense for a pure fpgadataflow graph -- so we check! all_nodes = list(model.graph.node) @@ -59,8 +97,7 @@ class InsertIODMA(Transformation): fc_extw_nodes = list( filter( lambda x: x.op_type == "StreamingFCLayer_Batch" - and get_by_name(x.attribute, "mem_mode") is not None - and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") == "external" + and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, ) @@ -78,11 +115,6 @@ class InsertIODMA(Transformation): return (model, False) else: if final_node.op_type != "IODMA": - # check if tensor is NHWC - assert ( - model.get_tensor_layout(graph_out_name) == DataLayout.NHWC - or model.get_tensor_layout(graph_out_name) == DataLayout.NC - ), "Data layout of output tensor must be NHWC or NC" out_shape = model.get_tensor_shape(graph_out_name) out_dtype = model.get_tensor_datatype(graph_out_name) final_node_inst = getCustomOp(final_node) @@ -123,11 +155,6 @@ class InsertIODMA(Transformation): ) model.graph.node.append(dma_node) if 
first_node.op_type != "IODMA": - # check if tensor is NHWC - assert ( - model.get_tensor_layout(graph_in_name) == DataLayout.NHWC - or model.get_tensor_layout(graph_in_name) == DataLayout.NC - ), "Data layout of input tensor must be NHWC or NC" in_shape = model.get_tensor_shape(graph_in_name) in_dtype = model.get_tensor_datatype(graph_in_name) first_node_inst = getCustomOp(first_node) @@ -168,11 +195,7 @@ class InsertIODMA(Transformation): ) model.graph.node.insert(0, dma_node) for fc_node in fc_extw_nodes: - # check if tensor is NHWC - assert ( - model.get_tensor_layout(fc_node.input[1]) == DataLayout.NHWC - or model.get_tensor_layout(graph_in_name) == DataLayout.NC - ), "Data layout of tensors must be NHWC or NC" + fc_inst = getCustomOp(fc_node) fc_w_name = fc_node.input[1] w_shape = model.get_tensor_shape(fc_w_name) w_dtype = model.get_tensor_datatype(fc_w_name) @@ -185,21 +208,24 @@ class InsertIODMA(Transformation): # calculate width of stream output from DMA pe = get_by_name(fc_node.attribute, "PE").i simd = get_by_name(fc_node.attribute, "SIMD").i - assert pe * simd == w_shape[0], "Malformed weight matrix" - streamWidth = simd * pe * w_dtype.bitwidth() + streamWidth = fc_inst.get_weightstream_width_padded() # make new buffer + W = model.get_initializer(fc_w_name) + iodma_mem = self.get_mem_init(W, pe, simd) + model.set_initializer(fc_w_name, iodma_mem) + fc_node_in = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape + model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape ) model.graph.value_info.append(fc_node_in) model.set_tensor_datatype(fc_node_in.name, w_dtype) - model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name)) + model.set_initializer(fc_node_in.name, W) dma_node = oh.make_node( "IODMA", [fc_w_name], [fc_node_in.name], - numInputVectors=[w_shape[1]], - NumChannels=w_shape[0], + numInputVectors=[iodma_mem.shape[0]], + NumChannels=pe * simd, dataType=str(w_dtype.name), 
intfWidth=intfwidth, streamWidth=streamWidth, diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 42f18d9a812d2db2119351dabfbb38e68c33194e..6ab12548abbcbe00496101bd146b2c9b873204c8 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -38,12 +38,38 @@ import warnings import pkg_resources as pk from . import template_driver from finn.core.modelwrapper import ModelWrapper +import numpy as np + +from finn.util.data_packing import ( + pack_innermost_dim_as_hex_string, + hexstring2npbytearray, +) +from finn.util.basic import roundup_to_integer_multiple + + +def to_external_tensor(init, w_dtype): + """Return an appropriately formatted and packed numpy byte array for given + external parameter tensor.""" + + weight_width = init.shape[1] * w_dtype.bitwidth() + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + hex_init = pack_innermost_dim_as_hex_string( + init, w_dtype, weight_width_padded, prefix="0x" + ) + ext_weight = np.array([], dtype=np.uint8) + for line in hex_init: + array_line = [ + x for x in reversed(hexstring2npbytearray(line, remove_prefix="0x")) + ] + ext_weight = np.append(ext_weight, array_line) + + return ext_weight class MakePYNQDriver(Transformation): """Create PYNQ Python code to correctly interface the generated accelerator, including data packing/unpacking. Should be called - after conversion to HLS layers and folding, but prior to the creation of + after conversion to HLS layers, folding and the creation of dataflow partitions for correct operation. 
platform: one of ["zynq-iodma", "alveo"] @@ -123,6 +149,35 @@ class MakePYNQDriver(Transformation): i_tensor_shape_packed = i_tensor_dummy_packed.shape o_tensor_shape_packed = o_tensor_dummy_packed.shape + # generate external weights npy files + weights_dir = pynq_driver_dir + "/runtime_weights" + + os.makedirs(weights_dir) + idma_idx = 0 + ext_weight_dma_cnt = 0 + + for node in model.graph.node: + assert ( + node.op_type == "StreamingDataflowPartition" + ), "CreateDataflowPartition needs to be applied before driver generation" + + producer = model.find_producer(node.input[0]) + init_tensor = model.get_initializer(node.input[0]) + + if producer is None: # input dma? + idma_name = "idma" + str(idma_idx) + if init_tensor is not None: # input weights dma? + ext_weight_dma_cnt += 1 + w_dtype = model.get_tensor_datatype(node.input[0]) + init_external_tensor = to_external_tensor(init_tensor, w_dtype) + np.save( + weights_dir + "/" + idma_name + ".npy", init_external_tensor + ) + else: + net_input_name = idma_name + + idma_idx += 1 + # fill in the driver template driver_py = pynq_driver_dir + "/driver.py" driver = template_driver.pynq_driver_template @@ -146,6 +201,8 @@ class MakePYNQDriver(Transformation): driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal)) driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded)) driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed)) + driver = driver.replace("$INPUT_DMA_NAME$", "'%s'" % net_input_name) + driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt)) with open(driver_py, "w") as f: f.write(driver) @@ -172,25 +229,35 @@ class MakePYNQDriver(Transformation): shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core") # generate weight files for runtime-writable layers - weights_dir = pynq_driver_dir + "/runtime_weights" - rt_layer_ind = 0 - os.makedirs(weights_dir) - for node in model.graph.node: - if node.op_type in ["StreamingFCLayer_Batch", 
"Thresholding_Batch"]: - node_inst = getCustomOp(node) - is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights") - if is_rt_weights == 1: - fcl_w = model.get_initializer(node.input[1]) - w_filename = weights_dir + "/%d_%s.dat" % (rt_layer_ind, node.name) - node_inst.make_weight_file(fcl_w, "decoupled_runtime", w_filename) - rt_layer_ind += 1 - elif node.op_type == "StreamingDataflowPartition": - warnings.warn( - """Please call MakePYNQDriver prior to - CreateDataflowPartition. Can only extract runtime-writable - weights from HLSCustomOp instances and not StreamingDataflowPartition. - """ - ) - else: - continue + + for sdp_ind, sdp_node in enumerate(model.graph.node): + assert sdp_node.op_type == "StreamingDataflowPartition" + # get dataflow model + sdp_node = getCustomOp(sdp_node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + dataflow_model = ModelWrapper(dataflow_model_filename) + rt_layer_ind = 0 + for node in dataflow_model.graph.node: + if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]: + node_inst = getCustomOp(node) + is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights") + if is_rt_weights == 1: + fcl_w = dataflow_model.get_initializer(node.input[1]) + w_filename = weights_dir + "/%d_%d_%s.dat" % ( + sdp_ind, + rt_layer_ind, + node.name, + ) + node_inst.make_weight_file( + fcl_w, "decoupled_runtime", w_filename + ) + rt_layer_ind += 1 + elif node.op_type == "StreamingDataflowPartition": + warnings.warn( + """Nested StreamingDataflowPartitions are not supported + """ + ) + else: + continue + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 1ac7ee178531e745bf68405d1ae9df35c0c216fb..3dab426ccf9bab73ddac83299bdc47f89ea46bdc 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -286,7 +286,10 @@ class 
MakeZYNQProject(Transformation): class ZynqBuild(Transformation): - """Best-effort attempt at building the accelerator for Zynq.""" + """Best-effort attempt at building the accelerator for Zynq. + It assumes the model has only fpgadataflow nodes + + """ def __init__(self, platform, period_ns, enable_debug=False): super().__init__() @@ -300,7 +303,6 @@ class ZynqBuild(Transformation): model = model.transform(InferDataLayouts()) # prepare at global level, then break up into kernels prep_transforms = [ - MakePYNQDriver(platform="zynq-iodma"), InsertIODMA(64), InsertDWC(), Floorplan(), @@ -335,6 +337,10 @@ class ZynqBuild(Transformation): model = model.transform( MakeZYNQProject(self.platform, enable_debug=self.enable_debug) ) + # set platform attribute for correct remote execution model.set_metadata_prop("platform", "zynq-iodma") + + # create driver + model = model.transform(MakePYNQDriver(platform="zynq-iodma")) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py index b595205714d8cb630816d2b42fe96640e49e506e..5265835dd2530a5c93ceefbef629a43d6f33de52 100644 --- a/src/finn/transformation/fpgadataflow/template_driver.py +++ b/src/finn/transformation/fpgadataflow/template_driver.py @@ -77,7 +77,9 @@ io_shape_dict = { "ishape_folded" : $INPUT_SHAPE_FOLDED$, "oshape_folded" : $OUTPUT_SHAPE_FOLDED$, "ishape_packed" : $INPUT_SHAPE_PACKED$, - "oshape_packed" : $OUTPUT_SHAPE_PACKED$ + "oshape_packed" : $OUTPUT_SHAPE_PACKED$, + "input_dma_name" : $INPUT_DMA_NAME$, + "number_of_external_weights": $EXT_WEIGHT_NUM$ } if __name__ == "__main__": diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index e52fb14b158a7927311d1b7e90067fea4bde6e27..0fe4276096852c08d0798be8e1ee715cc5769286 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -28,6 +28,7 
@@ import os import subprocess +import json from finn.core.modelwrapper import ModelWrapper from finn.transformation.base import Transformation @@ -38,14 +39,17 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import ( ) from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO -from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.floorplan import Floorplan from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from finn.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + RemoveUnusedTensors, +) from finn.util.basic import make_build_dir from finn.transformation.infer_data_layouts import InferDataLayouts from . 
import templates @@ -89,63 +93,47 @@ class CreateVitisXO(Transformation): _check_vitis_envvars() vivado_proj_dir = model.get_metadata_prop("vivado_stitch_proj") stitched_ip_dir = vivado_proj_dir + "/ip" + interfaces = json.loads(model.get_metadata_prop("vivado_stitch_ifnames")) args_string = [] - m_axis_idx = 0 - s_axis_idx = 0 + arg_id = 0 # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface # developed from instructions in UG1393 (v2019.2) and package_xo documentation # package_xo is responsible for generating the kernel xml - ifnames = eval(model.get_metadata_prop("vivado_stitch_ifnames")) assert ( - len(ifnames["axilite"]) <= 1 + len(interfaces["axilite"]) <= 1 ), "CreateVitisXO supports max 1 AXI lite interface" - if len(ifnames["axilite"]) == 1: - axilite_intf_name = ifnames["axilite"][0] - else: - axilite_intf_name = None - - for node in model.graph.node: - node_inst = getCustomOp(node) - arg_id = 0 - if node.op_type == "TLastMarker": - stream_width = node_inst.get_nodeattr("StreamWidth") - # add a stream input or output port, based on direction - if node_inst.get_nodeattr("Direction") == "in": - args_string.append( - "{in:4:%s:s_axis_%d:0x0:0x0:ap_uint<%s>:0}" - % (str(arg_id), s_axis_idx, str(stream_width)) - ) - s_axis_idx += 1 - else: - args_string.append( - "{out:4:%s:m_axis_%d:0x0:0x0:ap_uint<%s>:0}" - % (str(arg_id), m_axis_idx, str(stream_width)) + axilite_intf_name = None + if len(interfaces["axilite"]) == 1: + axilite_intf_name = interfaces["axilite"][0] + if len(interfaces["aximm"]) > 0: + args_string.append( + "{addr:1:%s:%s:0x8:0x10:ap_uint<%s>*:0}" + % ( + str(arg_id), + interfaces["aximm"][0][0], + str(interfaces["aximm"][0][1]), ) - m_axis_idx += 1 + ) arg_id += 1 - # add a axilite port if dynamic - # add a count parameter if dynamic - if node_inst.get_nodeattr("DynIters") == 1: - assert axilite_intf_name is not None - args_string.append( - "{numReps:0:%s:%s:0x4:0x10:uint:0}" - % (str(arg_id), axilite_intf_name) - ) 
- arg_id += 1 - elif node.op_type == "IODMA": - port_width = node_inst.get_nodeattr("intfWidth") - # add an address parameter - # add a count parameter args_string.append( - "{addr:1:%s:m_axi_gmem0:0x8:0x10:ap_uint<%s>*:0}" - % (str(arg_id), str(port_width)) + "{numReps:0:%s:%s:0x4:0x1C:uint:0}" + % (str(arg_id), axilite_intf_name) ) arg_id += 1 + else: args_string.append( - "{numReps:0:%s:%s:0x4:0x1C:uint:0}" + "{numReps:0:%s:%s:0x4:0x10:uint:0}" % (str(arg_id), axilite_intf_name) ) arg_id += 1 + for intf in interfaces["s_axis"] + interfaces["m_axis"]: + stream_width = intf[1] + stream_name = intf[0] + args_string.append( + "{%s:4:%s:%s:0x0:0x0:ap_uint<%s>:0}" + % (stream_name, str(arg_id), stream_name, str(stream_width)) + ) + arg_id += 1 # save kernel xml then run package_xo xo_name = self.ip_name + ".xo" @@ -342,6 +330,7 @@ class VitisLink(Transformation): class VitisBuild(Transformation): """Best-effort attempt at building the accelerator with Vitis. + It assumes the model has only fpgadataflow nodes fpga_part: string identifying the target FPGA period_ns: target clock period @@ -377,7 +366,6 @@ class VitisBuild(Transformation): model = model.transform(InferDataLayouts()) # prepare at global level, then break up into kernels prep_transforms = [ - MakePYNQDriver(platform="alveo"), InsertIODMA(512), InsertDWC(), ] @@ -399,9 +387,7 @@ class VitisBuild(Transformation): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) - kernel_model = kernel_model.transform( - InsertTLastMarker(both=True, external=False, dynamic=False) - ) + kernel_model = kernel_model.transform(RemoveUnusedTensors()) kernel_model = kernel_model.transform(GiveUniqueNodeNames()) kernel_model.save(dataflow_model_filename) kernel_model = kernel_model.transform( @@ -430,4 +416,6 @@ class VitisBuild(Transformation): # set platform attribute for correct remote execution 
model.set_metadata_prop("platform", "alveo") + #create driver + model = model.transform(MakePYNQDriver(platform="alveo")) return (model, False) diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index c9259afabfa6fcae0020e378c79ce391c218408f..eedbf97f389754440a116cf8755c25d597c433ee 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -1,3 +1,31 @@ +# Copyright (c) 2021, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ import torch from brevitas.nn import QuantLinear, QuantReLU import torch.nn as nn diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py new file mode 100644 index 0000000000000000000000000000000000000000..0407395ed57dc07c6700efcebbb1fc8a767877bb --- /dev/null +++ b/tests/end2end/test_ext_weights.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +import os +import shutil +from finn.util.test import get_build_env, load_test_checkpoint_or_skip +import pytest +from finn.util.basic import make_build_dir +import pkg_resources as pk +import wget +import subprocess + +target_clk_ns = 10 +build_kind = "zynq" +build_dir = os.environ["FINN_BUILD_DIR"] +onnx_zip_url = "https://github.com/Xilinx/finn-examples" +onnx_zip_url += "/releases/download/v0.0.1a/onnx-models-bnn-pynq.zip" +onnx_zip_local = build_dir + "/onnx-models-bnn-pynq.zip" +onnx_dir_local = build_dir + "/onnx-models-bnn-pynq" + + +def get_checkpoint_name(step): + if step == "build": + # checkpoint for build step is an entire dir + return build_dir + "/end2end_ext_weights_build" + elif step == "download": + return onnx_dir_local + "/tfc-w1a1.onnx" + else: + # other checkpoints are onnx files + return build_dir + "/end2end_ext_weights_%s.onnx" % (step) + + +def test_end2end_ext_weights_download(): + if not os.path.isfile(onnx_zip_local): + wget.download(onnx_zip_url, out=onnx_zip_local) + assert os.path.isfile(onnx_zip_local) + subprocess.check_output(["unzip", "-o", onnx_zip_local, "-d", onnx_dir_local]) + assert os.path.isfile(get_checkpoint_name("download")) + + +@pytest.mark.slow +@pytest.mark.vivado +def test_end2end_ext_weights_build(): + model_file = get_checkpoint_name("download") + load_test_checkpoint_or_skip(model_file) + build_env = get_build_env(build_kind, target_clk_ns) + folding_config_file = pk.resource_filename( + "finn.qnn-data", "test_ext_weights/tfc-w1a1-extw.json" + ) + output_dir = make_build_dir("test_end2end_ext_weights_build") + cfg = build.DataflowBuildConfig( + output_dir=output_dir, + folding_config_file=folding_config_file, + synth_clk_period_ns=target_clk_ns, + board=build_env["board"], + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + 
build_cfg.DataflowOutputType.BITFILE, + build_cfg.DataflowOutputType.PYNQ_DRIVER, + build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE, + ], + ) + build.build_dataflow_cfg(model_file, cfg) + assert os.path.isfile(output_dir + "/deploy/bitfile/finn-accel.bit") + assert os.path.isfile(output_dir + "/deploy/bitfile/finn-accel.hwh") + assert os.path.isfile(output_dir + "/deploy/driver/driver.py") + assert os.path.isfile(output_dir + "/deploy/driver/runtime_weights/idma0.npy") + if os.path.isdir(get_checkpoint_name("build")): + shutil.rmtree(get_checkpoint_name("build")) + shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build")) + + +def test_end2end_ext_weights_run_on_hw(): + build_env = get_build_env(build_kind, target_clk_ns) + deploy_dir = get_checkpoint_name("build") + if not os.path.isdir(deploy_dir): + pytest.skip(deploy_dir + " not found from previous test step, skipping") + driver_dir = deploy_dir + "/driver" + assert os.path.isdir(driver_dir) + # create a shell script for running validation: 10 batches x 10 imgs + with open(driver_dir + "/validate.sh", "w") as f: + f.write( + """#!/bin/bash +cd %s/driver +echo %s | sudo -S python3.6 validate.py --dataset mnist --bitfile %s + """ + % ( + build_env["target_dir"] + "/end2end_ext_weights_build", + build_env["password"], + "../bitfile/finn-accel.bit", + ) + ) + # set up rsync command + remote_target = "%s@%s:%s" % ( + build_env["username"], + build_env["ip"], + build_env["target_dir"], + ) + rsync_res = subprocess.run( + [ + "sshpass", + "-p", + build_env["password"], + "rsync", + "-avz", + deploy_dir, + remote_target, + ] + ) + assert rsync_res.returncode == 0 + remote_verif_cmd = [ + "sshpass", + "-p", + build_env["password"], + "ssh", + "%s@%s" % (build_env["username"], build_env["ip"]), + "sh", + build_env["target_dir"] + "/end2end_ext_weights_build/driver/validate.sh", + ] + verif_res = subprocess.run( + remote_verif_cmd, + stdout=subprocess.PIPE, + universal_newlines=True, + 
input=build_env["password"], + ) + assert verif_res.returncode == 0 + log_output = verif_res.stdout.split("\n") + assert log_output[-3] == "batch 100 / 100 : total OK 9296 NOK 704" + assert log_output[-2] == "Final accuracy: 92.960000"