diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 332f0263859504230c91f6d11147c28aa4e0d617..0aea65fdd7999b56989239685f6606a8e1b2e618 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -12,8 +12,8 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=8b33862ea6955234e59cc52888f268cc690acf90
-BREVITAS_COMMIT=d579814b62ab33af0cd24fef49a6a34dc7e2f9b3
+FINN_BASE_COMMIT=f2e5f0582ef2b7cbc134168993816c337ca8d3a6
+BREVITAS_COMMIT=b75e0408d9759ed519296e3af29b9c16fb94b0b8
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
 HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e
 PYVERILATOR_COMMIT=06c29ecf3ba0361e3d0a75c98f6918ba67bf0e27
diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py
index 4d1a0bfa76d01f10706748b0200ac7fd3d312db7..2c714b1f12b75e9789f1865d6737422f4d9d9a97 100644
--- a/src/finn/analysis/fpgadataflow/res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/res_estimation.py
@@ -45,3 +45,41 @@ def res_estimation(model):
             res_dict[node.name] = inst.node_res_estimation()
 
     return res_dict
+
+
+def res_estimation_complete(model):
+    """Estimates the resources needed for the given model, covering all
+    possible values of the resource-related node attributes (resType, ram_style).
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass, so that all nodes
+    appear in the results.
+
+    Returns {node name : [resource estimation(s)]}."""
+
+    res_dict = {}
+    for node in model.graph.node:
+        if is_fpgadataflow_node(node) is True:
+            op_type = node.op_type
+            inst = registry.getCustomOp(node)
+            if op_type in ["StreamingFCLayer_Batch", "Vector_Vector_Activate_Batch"]:
+                orig_restype = inst.get_nodeattr("resType")
+                res_dict[node.name] = []
+                inst.set_nodeattr("resType", "dsp")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("resType", "lut")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("resType", orig_restype)
+            elif op_type == "ConvolutionInputGenerator":
+                orig_ramstyle = inst.get_nodeattr("ram_style")
+                res_dict[node.name] = []
+                inst.set_nodeattr("ram_style", "block")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("ram_style", "distributed")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("ram_style", "ultra")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("ram_style", orig_ramstyle)
+            else:
+                res_dict[node.name] = [inst.node_res_estimation()]
+
+    return res_dict
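The new pass is a drop-in companion to res_estimation and is invoked the same way, via ModelWrapper.analysis. A minimal usage sketch follows; the model path and the surrounding calls are illustrative, not part of this patch:

    from finn.core.modelwrapper import ModelWrapper
    from finn.transformation.general import GiveUniqueNodeNames
    from finn.analysis.fpgadataflow.res_estimation import res_estimation_complete

    model = ModelWrapper("dataflow_model.onnx")  # placeholder path
    # unique node names are required for every node to appear in the result
    model = model.transform(GiveUniqueNodeNames())
    # maps node name -> list of estimate dicts, one per resType/ram_style setting
    res = model.analysis(res_estimation_complete)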
("s", True, ""), "paramDataType": ("s", True, ""), diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 3044ed46f9524e035b95661eed5b7043c84dfdc6..3f400053df8de6ec1e53e39fb5a3edee15f3ab30 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -28,6 +28,7 @@ import os +import math import numpy as np from finn.core.datatype import DataType @@ -69,13 +70,18 @@ class ConvolutionInputGenerator(HLSCustomOp): # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "outputDataType": ("s", True, ""), - "depthwise": ("i", False, 0), + "depthwise": ("i", False, 0, {0, 1}), # FPGA resource type for ConvolutionInputGenerator input buffer # auto -- let Vivado HLS decide # block -- use BRAM # distributed -- use LUTRAM # ultra -- use URAM - "ram_style": ("s", False, "distributed"), + "ram_style": ( + "s", + False, + "distributed", + {"auto", "block", "distributed", "ultra"}, + ), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -194,6 +200,75 @@ class ConvolutionInputGenerator(HLSCustomOp): return int(exp_cycles) + def bram_estimation(self): + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim") + k = self.get_nodeattr("ConvKernelDim") + stride = self.get_nodeattr("Stride") + ram_style = self.get_nodeattr("ram_style") + if ram_style == "block" or ram_style == "auto": + ram_depth = ifm_dim * ifm_ch / simd + if ram_depth <= 512: + ram_width = 36 + elif ram_depth <= 1024: + ram_width = 18 + elif ram_depth <= 2048: + ram_width = 9 + elif ram_depth <= 4096: + ram_width = 4 + elif ram_depth <= 8192: + ram_width = 2 + else: + ram_width = 1 + return int( + (k + stride) + * ( + math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width) + * math.ceil(ifm_dim * ifm_ch / simd / ram_depth) + ) + ) + else: + return 0 + + def lut_estimation(self): + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim") + k = self.get_nodeattr("ConvKernelDim") + stride = self.get_nodeattr("Stride") + ram_style = self.get_nodeattr("ram_style") + if ram_style == "distributed": + ram_luts = int( + (k + stride) + * ( + simd + * self.get_input_datatype().bitwidth() + * math.ceil(ifm_dim * ifm_ch / simd / 64) + ) + ) + else: + ram_luts = 0 + return 300 + ram_luts + + def uram_estimation(self): + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim") + k = self.get_nodeattr("ConvKernelDim") + stride = self.get_nodeattr("Stride") + ram_style = self.get_nodeattr("ram_style") + if ram_style == "ultra": + return int( + (k + stride) + * ( + math.ceil(simd * self.get_input_datatype().bitwidth() / 64) + * math.ceil(ifm_dim * ifm_ch / simd / 4096) + ) + ) + else: + return 0 + def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py index f4f2b89f076d5c181fe57bf030d1a59706e301db..e8efa3abb4e75830bf31cd88c8cb21f517e0a9f7 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -28,7 +28,7 @@ class FMPadding_Batch(HLSCustomOp): # controls distribution of padded pixels # in case of uneven padding -- see FMPadding fxn # in 
hlslib - "PaddingStyle": ("i", False, 2), + "PaddingStyle": ("i", False, 2, {2, 1}), # shape describing input vecs per execution "numInputVectors": ("i", False, 1), } diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 436384c51a2e629af051354619744928c1187feb..06cc2d253d577fe14ae965e07868dea4e656d927 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -82,7 +82,7 @@ class HLSCustomOp(CustomOp): "ipgen_path": ("s", False, ""), "ip_path": ("s", False, ""), "ip_vlnv": ("s", False, ""), - "exec_mode": ("s", False, ""), + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim"}), "cycles_rtlsim": ("i", False, 0), "cycles_estimate": ("i", False, 0), "rtlsim_trace": ("s", False, ""), @@ -194,6 +194,8 @@ class HLSCustomOp(CustomOp): ret["BRAM_18K"] = self.bram_estimation() ret["BRAM_efficiency"] = self.bram_efficiency_estimation() ret["LUT"] = self.lut_estimation() + ret["URAM"] = self.uram_estimation() + ret["DSP"] = self.dsp_estimation() return ret def bram_efficiency_estimation(self): @@ -206,11 +208,21 @@ class HLSCustomOp(CustomOp): HLSCustomOp class but has to be filled by every node""" return 0 + def uram_estimation(self): + """Function for UltraRAM resource estimation, is member function of + HLSCustomOp class but has to be filled by every node""" + return 0 + def lut_estimation(self): """Function for LUT resource estimation, is member function of HLSCustomOp class but has to be filled by every node""" return 0 + def dsp_estimation(self): + """Function for DSP resource estimation, is member function of + HLSCustomOp class but has to be filled by every node""" + return 0 + def get_exp_cycles(self): """Function for estimation of expected cycles for set folding, is member function of HLSCustomOp class but has to be filled diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py index 2976dae223420ae17e8d92562866d08cda890a64..dc5c37619dae26ceedbcede0032caa930c16f9dd 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -87,8 +87,8 @@ class IODMA(HLSCustomOp): "streamWidth": ("i", False, 32), # DMA-specific parameters "intfWidth": ("i", False, 32), - "burstMode": ("s", False, "increment"), - "direction": ("s", False, "in"), + "burstMode": ("s", False, "increment", {"wrap", "increment"}), + "direction": ("s", False, "in", {"in", "out"}), # shape describing input vecs per execution "numInputVectors": ("ints", False, [1]), # name of axi-mm interface diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index b76e5432dc51efc604fc69b31c10f08442e9600d..edba084b5258de37198520257e438f90f8cc65e3 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -60,9 +60,9 @@ class Pool_Batch(HLSCustomOp): "KernelSize": ("i", True, 0), # Function: # - MaxPool - # - AvgPool (not yet supported, but HLSLIB does) - # - AccPool (not yet supported, but HLSLIB does) - "Function": ("s", True, ""), + # - QuantAvgPool + # TODO add support for AvgPool and AccPool + "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}), "OutImgDim": ("i", True, 0), # FINN DataTypes for inputs/outputs "InputDataType": ("s", True, ""), diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py index 
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 1bc20fb3febe3ff411056664c1f1d0c439d9cda1..16ec6587861c7de6829f812fb539d6fc40c2ece4 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -55,7 +55,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             # Toggle between hls or IPI implementation
             # hls - use the hls generated IP during stitching
             # vivado - use the AXI Infrastructure DWC
-            "impl_style": ("s", False, "hls"),
+            "impl_style": ("s", False, "hls", {"hls", "vivado"}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 0d7fa341d0e8ecd465731a26403a95b97de8cd98..b4f85c29bd5233e65b40b2bd580b33c714baf378 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -68,7 +68,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "SIMD": ("i", True, 0),
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
-            "resType": ("s", True, ""),
+            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
             "ActVal": ("i", False, 0),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
@@ -78,9 +78,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "accDataType": ("s", False, "INT32"),
             # use xnor-popcount for binary weights/inputs, thus treating them
             # as bipolar
-            "binaryXnorMode": ("i", False, 0),
+            "binaryXnorMode": ("i", False, 0, {0, 1}),
             # no-activation mode (produce accumulators)
-            "noActivation": ("i", False, 0),
+            "noActivation": ("i", False, 0, {0, 1}),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -90,13 +90,13 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # const -- embedded weights, default, long compile/synth times
             # decoupled -- streaming weights with weight streamer packaged inside IP
             # external -- streaming weights with external streamer
-            "mem_mode": ("s", False, "const"),
+            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
             # FPGA resource type for memories in decoupled mode
             # auto -- let Vivado decide
             # block -- use BRAM
             # distributed -- use LUTRAM
             # see also https://www.xilinx.com/support/answers/38070.html
-            "ram_style": ("s", False, "auto"),
+            "ram_style": ("s", False, "auto", {"auto", "block", "distributed"}),
             # (mem_mode = decoupled only) whether weights will be writable through
             # an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -106,7 +106,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # always "flush" the accelerator by first passing a dummy input
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
- "runtime_writeable_weights": ("i", False, 0), + "runtime_writeable_weights": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -231,7 +231,27 @@ class StreamingFCLayer_Batch(HLSCustomOp): D_in = self.get_nodeattr("MW") D_out = self.get_nodeattr("MH") omega = (D_in * D_out) / (Q * P) - return P * (math.ceil(omega / 512)) * (math.ceil((Q * W) / 36)) + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + return 0 + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # assuming decoupled (RTL) memory, which is more efficient than const (HLS) + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() @@ -239,6 +259,8 @@ class StreamingFCLayer_Batch(HLSCustomOp): D_in = self.get_nodeattr("MW") D_out = self.get_nodeattr("MH") bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 wbits = W * D_in * D_out bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity @@ -254,6 +276,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): # TODO add in/out FIFO contributions P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") wdt = self.get_weight_datatype() W = wdt.bitwidth() # determine tdt with input and weight data types @@ -262,8 +285,55 @@ class StreamingFCLayer_Batch(HLSCustomOp): # parameters from experiments in paper mentioned above c0 = 300 c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_bits = W + A + np.ceil(math.log(MW, 2)) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + if noact == 0: + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2 ** B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) + comp_luts = (2 ** B - 1) * acc_bits + + return int( + c0 + + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + + c2 + ) - return c0 + c1 * (P * Q) * (W * A) + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) def get_exp_cycles(self): pe = self.get_nodeattr("PE") @@ -915,6 +985,11 @@ class StreamingFCLayer_Batch(HLSCustomOp): 
@@ -915,6 +985,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def docompute(self):
         mem_mode = self.get_nodeattr("mem_mode")
+        map_to_hls_mult_style = {
+            "auto": "ap_resource_dflt()",
+            "lut": "ap_resource_lut()",
+            "dsp": "ap_resource_dsp()",
+        }
         tmpl_args = self.get_template_param_values()
         if self.calc_tmem() == 0:
             odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
@@ -931,7 +1006,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     tmpl_args["TDstI"],
                     tmpl_args["TWeightI"],
                     threshs,
-                    self.get_nodeattr("resType"),
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
                 )
             ]
         elif mem_mode == "decoupled" or mem_mode == "external":
@@ -949,7 +1024,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     tmpl_args["TWeightI"],
                     wdtype_hls_str,
                     threshs,
-                    self.get_nodeattr("resType"),
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
                 )
             ]
 
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 358e2231483453a9e38a9e6e1d96c88ebef514d5..21534f9ab0b7d571c8c492115930ecd05e098856 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -56,13 +56,18 @@ class StreamingFIFO(HLSCustomOp):
             # Toggle between hls or IPI implementation
             # rtl - use the hls generated IP during stitching
             # vivado - use the AXI Infrastructure FIFO
-            "impl_style": ("s", False, "rtl"),
+            "impl_style": ("s", False, "rtl", {"rtl", "vivado"}),
             # FPGA resource type for FIFOs when impl_style is vivado
             # auto -- let Vivado decide
             # block -- use BRAM
             # distributed -- use LUTRAM
             # ultra -- use URAM (on UltraScale+)
-            "ram_style": ("s", False, "auto"),
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
         }
 
         my_attrs.update(super().get_nodeattr_types())
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index d896c530baed6c9ab175961f359dde1fbc70c303..8a944fe77dc938db4154bb0a2ffcff8fdaefbd72 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -70,7 +70,7 @@ class Thresholding_Batch(HLSCustomOp):
             # number of steps in thresholding function
             "numSteps": ("i", True, 1),
             # string defining memory type
-            "ram_style": ("s", False, "distributed"),
+            "ram_style": ("s", False, "distributed", {"distributed", "block"}),
             # FINN DataTypes for inputs, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -88,7 +88,7 @@ class Thresholding_Batch(HLSCustomOp):
             # memory mode for the thresholds
             # const -- embedded thresholds, default
             # decoupled -- streaming thresholds with streamer packaged inside IP
-            "mem_mode": ("s", False, "const"),
+            "mem_mode": ("s", False, "const", {"const", "decoupled"}),
             # (mem_mode = decoupled only) whether weights (thresholds) will be
             # writable through an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -98,7 +98,7 @@ class Thresholding_Batch(HLSCustomOp):
             # always "flush" the accelerator by first passing a dummy input
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
- "runtime_writeable_weights": ("i", False, 0), + "runtime_writeable_weights": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index bdc64c9d2aa894f416bf5d99213908160970ea4b..bedaf0984c39ef7603e6829961d7a3efb6ff489f 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -47,14 +47,14 @@ class TLastMarker(HLSCustomOp): # whether static or dynamic (from AXI lite) number of iterations are used "DynIters": ("i", False, 1), # direction: whether to insert or remove TLAST - "Direction": ("s", False, "out"), + "Direction": ("s", False, "out", {"out", "in"}), # width of input-output data streams, in bits "StreamWidth": ("i", True, 0), # width of individual element in stream, in bits "ElemWidth": ("i", True, 0), # Protocol: external or internal # Vitis docs recommend using qdma_axis for external, ap_axiu for internal - "Protocol": ("s", False, "external"), + "Protocol": ("s", False, "external", {"external", "internal"}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py index b50a56419bce5b9db21da930451a7b3db11e5a0c..333884f361983e2a465715f3f4119c9c6384558e 100644 --- a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py +++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py @@ -1,5 +1,6 @@ import os import numpy as np +import math from onnx import TensorProto, helper from finn.core.datatype import DataType @@ -24,14 +25,14 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): "Dim": ("i", True, 0), "Channels": ("i", True, 0), "Kernel": ("i", True, 0), - "resType": ("s", True, ""), + "resType": ("s", False, "auto", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), "outputDataType": ("s", True, ""), # no-activation mode (produce accumulators) - "noActivation": ("i", False, 0), + "noActivation": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -408,6 +409,11 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): ) def docompute(self): + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } tmpl_args = self.get_template_param_values() if self.calc_tmem() == 0: odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() @@ -423,7 +429,7 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): tmpl_args["TDstI"], tmpl_args["TWeightI"], threshs, - self.get_nodeattr("resType"), + map_to_hls_mult_style[self.get_nodeattr("resType")], ) ] @@ -504,3 +510,99 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): "complete dim=3" ) ) + + def bram_estimation(self): + """Calculates resource estimation for BRAM""" + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # since this is HLS memory, not using the full width of a BRAM + # assuming memories up to 128 deep get implemented in LUTs + if self.calc_wmem() <= 128: + return 0 + + if W == 1: + return math.ceil(omega / 16384) * P + elif W == 2: + return math.ceil(omega / 8192) * P + elif W <= 4: + return 
+            return (math.ceil(omega / 4096)) * (math.ceil(W / 4)) * P
+        elif W <= 9:
+            return (math.ceil(omega / 2048)) * (math.ceil(W / 8)) * P
+        elif W <= 18 or omega > 512:
+            return (math.ceil(omega / 1024)) * (math.ceil(W / 16)) * P
+        else:
+            return (math.ceil(omega / 512)) * (math.ceil(W / 32)) * P
+
+    def bram_efficiency_estimation(self):
+        P = self.get_nodeattr("PE")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        omega = self.calc_wmem()
+        bram16_est = self.bram_estimation()
+        if bram16_est == 0:
+            return 1
+        wbits = W * P * omega
+        bram16_est_capacity = bram16_est * 36 * 512
+        return wbits / bram16_est_capacity
+
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs based on:
+        - FINN-R: An End-to-End Deep-Learning Framework for Fast
+        Exploration of Quantized Neural Networks
+        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+        Y. Umuroglu, M. Leeser and K. Vissers
+        - 12. Sep 2018
+        """
+        # TODO add in/out FIFO contributions
+        P = self.get_nodeattr("PE")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        # determine tdt with input and weight data types
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        # parameters from experiments in paper mentioned above
+        c0 = 300
+        c1 = 1.1
+        c2 = 0
+        if self.calc_wmem() <= 128:
+            c2 = P * W * math.ceil(self.calc_wmem() / 64)
+
+        # multiplication
+        res_type = self.get_nodeattr("resType")
+        if res_type == "dsp":
+            mult_luts = 0
+        else:
+            mult_luts = (2 * math.ceil((W + A) / 6) - 1) * (W + A)
+        # accumulator
+        k = self.get_nodeattr("Kernel")
+        acc_bits = W + A + math.ceil(math.log(k * k, 2))
+        acc_luts = acc_bits
+        # thresholds and threshold comparators
+        thr_luts = 0
+        comp_luts = 0
+        noact = self.get_nodeattr("noActivation")
+        if noact == 0:
+            odt = self.get_output_datatype()
+            B = odt.bitwidth()
+            thr_luts = (2 ** B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = (2 ** B - 1) * acc_bits
+
+        return int(c0 + c1 * (P * (mult_luts + acc_luts + thr_luts + comp_luts)) + c2)
+
+    def dsp_estimation(self):
+        # multiplication
+        P = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        if res_type == "dsp":
+            mult_dsp = P * np.ceil((W + A) / 48)  # TODO: more accurate modelling
+        else:
+            mult_dsp = 0
+        return int(mult_dsp)
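The same cost constants can be sanity-checked for the VVAU. Again all values are illustrative assumptions, not taken from the patch (Kernel=3, PE equal to Channels so wmem = 3 * 3 = 9, PE=8, 4-bit weights and inputs, noActivation=1):

    import math

    mult_luts = (2 * math.ceil((4 + 4) / 6) - 1) * (4 + 4)  # = 24 per PE
    acc_bits = 4 + 4 + math.ceil(math.log(3 * 3, 2))  # = 12
    c2 = 8 * 4 * math.ceil(9 / 64)  # = 32, since wmem <= 128
    lut = int(300 + 1.1 * (8 * (mult_luts + acc_bits)) + c2)  # = 648 for resType="lut"
    dsp = 8 * math.ceil((4 + 4) / 48)  # = 8 for resType="dsp"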
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 215cdc9a0b3cba172961a010eda24afa50687373..749cf6c91a975a2ffaffedefa77b2f3fcb793e32 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -442,7 +442,6 @@ class InferBinaryStreamingFCLayer(Transformation):
                     [mt_output],
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
-                    resType="ap_resource_lut()",
                     MW=mw,
                     MH=mh,
                     SIMD=simd,
@@ -473,7 +472,6 @@ class InferBinaryStreamingFCLayer(Transformation):
                     [mm_output],
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
-                    resType="ap_resource_lut()",
                     MW=mw,
                     MH=mh,
                     SIMD=simd,
@@ -577,7 +575,6 @@ class InferQuantizedStreamingFCLayer(Transformation):
                     [mt_output],
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
-                    resType="ap_resource_lut()",
                     MW=mw,
                     MH=mh,
                     SIMD=simd,
@@ -608,7 +605,6 @@ class InferQuantizedStreamingFCLayer(Transformation):
                     [mm_output],
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
-                    resType="ap_resource_lut()",
                     MW=mw,
                     MH=mh,
                     SIMD=simd,
@@ -728,7 +724,7 @@ class InferVVAU(Transformation):
                     [mt_output],
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
-                    resType="ap_resource_lut()",
+                    resType="lut",
                     PE=pe,
                     Dim=mm_in_shape[1],
                     Channels=channels,
@@ -756,7 +752,7 @@ class InferVVAU(Transformation):
                     [mm_output],
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
-                    resType="ap_resource_lut()",
+                    resType="lut",
                     PE=pe,
                     Dim=mm_in_shape[1],
                     Channels=channels,
diff --git a/src/finn/util/create.py b/src/finn/util/create.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9c5d7b1b59916edfc8730992535f3ddb57c4d60
--- /dev/null
+++ b/src/finn/util/create.py
@@ -0,0 +1,178 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+
+
+def hls_random_mlp_maker(layer_spec):
+    """Create an MLP of given specification using HLSCustomOp instances.
+    Generate random weights/thresholds of appropriate size."""
+    ret = []
+    for lyr in layer_spec:
+        idt = lyr["idt"]
+        wdt = lyr["wdt"]
+        mw = lyr["mw"]
+        mh = lyr["mh"]
+        act = lyr["act"]
+        lyr["W"] = gen_finn_dt_tensor(wdt, (mw, mh))
+        if act is None:
+            # no activation, produce accumulators
+            T = None
+            tdt = None
+            if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+                odt = DataType.UINT32
+            else:
+                odt = DataType.INT32
+        else:
+            odt = act
+            (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+            n_steps = act.get_num_possible_values() - 1
+            T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
+            # provide non-decreasing thresholds
+            T = np.sort(T, axis=1)
+            # generate thresholds for activation
+            if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+                tdt = DataType.UINT32
+                # bias thresholds to be positive
+                T = np.ceil((T + mw) / 2)
+                assert (T >= 0).all()
+            else:
+                tdt = DataType.INT32
+        lyr["T"] = T
+        lyr["tdt"] = tdt
+        lyr["odt"] = odt
+        ret.append(lyr)
+
+    return hls_mlp_maker(ret)
+
+
+def hls_mlp_maker(layer_spec):
+    """Create an MLP of given specification using HLSCustomOp instances."""
+
+    current_in_name = ""
+    current_out_name = ""
+    i = 0
+
+    graph = helper.make_graph(nodes=[], name="mlp", inputs=[], outputs=[])
+
+    model = helper.make_model(graph, producer_name="finn")
+    model = ModelWrapper(model)
+
+    for lyr in layer_spec:
+        current_W_name = "W_%d" % i
+        current_T_name = "T_%d" % i
+        current_in_name = "act_%d" % i
+        current_out_name = "act_%d" % (i + 1)
+
+        W = lyr["W"]
+        (mw, mh) = W.shape
+        T = lyr["T"]
+        pe = lyr["pe"]
+        simd = lyr["simd"]
+        wdt = lyr["wdt"]
+        idt = lyr["idt"]
+        tdt = lyr["tdt"]
+        odt = lyr["odt"]
+
+        if i == 0:
+            global_in = helper.make_tensor_value_info(
+                current_in_name, TensorProto.FLOAT, [1, mw]
+            )
+            model.graph.input.append(global_in)
+
+        if i == len(layer_spec) - 1:
+            global_out = helper.make_tensor_value_info(
+                current_out_name, TensorProto.FLOAT, [1, mh]
+            )
+            model.graph.output.append(global_out)
+
+        # there are two ways to implement bipolar weights and inputs for
+        # StreamingFC:
+        # - specify their datatypes as such
+        # - specify their datatypes as BINARY and use binaryXnorMode
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            # we'll internally convert weights/inputs to binary and specify the
+            # datatypes as such, and also set the binaryXnorMode attribute to 1
+            export_wdt = DataType.BINARY
+            export_idt = DataType.BINARY
+            binary_xnor_mode = 1
+        else:
+            export_wdt = wdt
+            export_idt = idt
+            binary_xnor_mode = 0
+
+        if T is not None:
+            no_act = 0
+            node_inp_list = [current_in_name, current_W_name, current_T_name]
+            if odt == DataType.BIPOLAR:
+                actval = 0
+            else:
+                actval = odt.min()
+        else:
+            # no thresholds
+            node_inp_list = [current_in_name, current_W_name]
+            actval = 0
+            no_act = 1
+        FCLayer_node = helper.make_node(
+            "StreamingFCLayer_Batch",
+            node_inp_list,
+            [current_out_name],
+            domain="finn.custom_op.fpgadataflow",
+            backend="fpgadataflow",
+            MW=mw,
+            MH=mh,
+            SIMD=simd,
+            PE=pe,
+            inputDataType=export_idt.name,
+            weightDataType=export_wdt.name,
+            outputDataType=odt.name,
+            ActVal=actval,
+            binaryXnorMode=binary_xnor_mode,
+            noActivation=no_act,
+        )
+
+        model.graph.node.append(FCLayer_node)
+        model.set_tensor_datatype(current_in_name, idt)
+        model.set_tensor_datatype(current_out_name, odt)
+        model.set_tensor_datatype(current_W_name, wdt)
+        if binary_xnor_mode:
+            # convert bipolar to binary
+            model.set_initializer(current_W_name, (W + 1) / 2)
+        else:
+            model.set_initializer(current_W_name, W)
+        if T is not None:
+            model.set_tensor_datatype(current_T_name, tdt)
+            model.set_initializer(current_T_name, T)
+        i += 1
+
+    return model
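A smaller usage sketch than the full test added below; all layer sizes here are illustrative:

    import finn.util.create as create
    from finn.core.datatype import DataType

    layer_spec = [
        # 64-in, 32-out layer with an INT2 quantized activation
        {"mw": 64, "mh": 32, "simd": 8, "pe": 4,
         "idt": DataType.BIPOLAR, "wdt": DataType.INT2, "act": DataType.INT2},
        # final layer emits raw INT32 accumulators ("act": None)
        {"mw": 32, "mh": 10, "simd": 4, "pe": 2,
         "idt": DataType.INT2, "wdt": DataType.INT2, "act": None},
    ]
    # one StreamingFCLayer_Batch node per spec entry, wrapped in a ModelWrapper
    model = create.hls_random_mlp_maker(layer_spec)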
diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py
index 287cb83752f8b1f935373dd2492fc9859cadd354..cf3e064804216e192909eae75f01880554f03d9f 100644
--- a/tests/fpgadataflow/test_code_gen_trafo.py
+++ b/tests/fpgadataflow/test_code_gen_trafo.py
@@ -55,7 +55,6 @@ def test_code_gen_trafo():
         backend="fpgadataflow",
         code_gen_dir="",
         executable_path="",
-        resType="ap_resource_lut()",
         MW=mw,
         MH=mh,
         SIMD=simd,
diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py
index 811b2741c83a1354cfe9a44384e1c2fbbe3f4e3b..a12c69285b7b335f075d8ffd7ba27e039ebc6f8c 100644
--- a/tests/fpgadataflow/test_compilation_trafo.py
+++ b/tests/fpgadataflow/test_compilation_trafo.py
@@ -57,7 +57,6 @@ def test_compilation_trafo():
         backend="fpgadataflow",
         code_gen_dir="",
         executable_path="",
-        resType="ap_resource_lut()",
         MW=mw,
         MH=mh,
         SIMD=simd,
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 436d67231266faa4699ef2b10916fec13e875f2c..00f1ba5d59288b1a463fadbd684ff872269d6970 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -90,7 +90,6 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
         MW=mw,
         MH=mh,
         SIMD=simd,
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index a1ca0365669111e60b7302d86d03d3d6469af783..306844c7ef3828d8483d3b0006491864f1525e21 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -90,7 +90,6 @@ def create_one_fc_model(mem_mode="const"):
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
         MW=m,
         MH=m,
         SIMD=simd,
@@ -145,7 +144,6 @@ def create_two_fc_model(mem_mode="decoupled"):
         ["mid"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
         MW=m,
         MH=m,
         SIMD=simd,
@@ -165,7 +163,6 @@ def create_two_fc_model(mem_mode="decoupled"):
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
         MW=m,
         MH=m,
         SIMD=simd,
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index 61b83f12758593cb8832f412de5c3aaf93053fd8..06ebd90000e7466b2781d3284c5a0a0e56733dea 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -28,7 +28,10 @@
 
 from onnx import TensorProto, helper
 
-from finn.analysis.fpgadataflow.res_estimation import res_estimation
+from finn.analysis.fpgadataflow.res_estimation import (
+    res_estimation,
+    res_estimation_complete,
+)
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.general import GiveUniqueNodeNames
@@ -53,7 +56,7 @@ def test_res_estimate():
     pe = 1
     idt = DataType.INT2
     wdt = DataType.INT2
-    odt = DataType.INT32
+    odt = DataType.INT2
     actval = odt.min()
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, mw])
@@ -66,7 +69,6 @@ def test_res_estimate():
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
resType="ap_resource_lut()", MW=mw, MH=mh, SIMD=simd, @@ -93,13 +95,28 @@ def test_res_estimate(): prod_resource_estimation = model.analysis(res_estimation) expect_resource_estimation = { "StreamingFCLayer_Batch_0": { - "BRAM_18K": 1, - "BRAM_efficiency": 0.001736111111111111, - "LUT": 304.4, + "BRAM_18K": 0, + "BRAM_efficiency": 1, + "LUT": 357, + "DSP": 0, + "URAM": 0, } } assert check_two_dict_for_equality( prod_resource_estimation, expect_resource_estimation ), """The produced output of - the resource estimation analysis pass is not equal to the expected one""" + the res_estimation analysis pass is not equal to the expected one""" + + prod_resource_estimation = model.analysis(res_estimation_complete) + expect_resource_estimation = { + "StreamingFCLayer_Batch_0": [ + {"BRAM_18K": 0, "BRAM_efficiency": 1, "LUT": 352, "DSP": 1, "URAM": 0}, + {"BRAM_18K": 0, "BRAM_efficiency": 1, "LUT": 357, "DSP": 0, "URAM": 0}, + ] + } + + assert check_two_dict_for_equality( + prod_resource_estimation, expect_resource_estimation + ), """The produced output of + the res_estimation_complete analysis pass is not equal to the expected one""" diff --git a/tests/util/test_create.py b/tests/util/test_create.py new file mode 100644 index 0000000000000000000000000000000000000000..42a288b74ecda9746296519b1b86563c75b2752e --- /dev/null +++ b/tests/util/test_create.py @@ -0,0 +1,65 @@ +# Copyright (c) 2020 Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+import pytest
+
+import finn.util.create as create
+from finn.core.datatype import DataType
+
+
+@pytest.mark.parametrize("bitwidth", [DataType.BIPOLAR, DataType.INT2, DataType.INT4])
+def test_hls_random_mlp_maker(bitwidth):
+    w = bitwidth
+    a = bitwidth
+    layer_spec = [
+        {
+            "mw": 185,
+            "mh": 100,
+            "simd": 185,
+            "pe": 100,
+            "idt": DataType.BIPOLAR,
+            "wdt": w,
+            "act": a,
+        },
+        {"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
+        {"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
+        {"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
+        {
+            "mw": 100,
+            "mh": 1,
+            "simd": 100,
+            "pe": 1,
+            "idt": a,
+            "wdt": w,
+            "act": DataType.BIPOLAR,
+        },
+    ]
+
+    ret = create.hls_random_mlp_maker(layer_spec)
+    assert len(ret.graph.node) == 5
+    # ret.save("mlp-%s.onnx" % str(bitwidth))