From e63329dce2fd9add1b2a67735fb8fdca4a5a83f7 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 4 Jun 2020 09:25:07 +0100
Subject: [PATCH] [HLSCustomOp] Add first draft of vvau node

---
 .../vector_vector_activate_batch.py           | 501 ++++++++++++++++++
 1 file changed, 501 insertions(+)
 create mode 100644 src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py

diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
new file mode 100644
index 000000000..8eec0d4f3
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
@@ -0,0 +1,501 @@
+import os
+import numpy as np
+
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    rtlsim_output_to_npy,
+)
+
+# ONNX i/o tensor shape assumptions for Vector_Vector_Activate_Batch:
+# input 0 is the input tensor, shape (1, dim, dim, k*k*channels)
+# input 1 is the weight tensor, shape (channels, 1, k, k)
+# (optional) input 2 is the thresholds tensor, shape (channels, n_thres)
+# output 0 is the output tensor, shape (1, dim, dim, channels)
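+#
+# worked example with hypothetical values, to illustrate the folding
+# performed by this op: for Dim=10, Kernel=3, Channels=4, PE=2 we get
+#   normal input shape:  (1, 10, 10, 3*3*4) = (1, 10, 10, 36)
+#   folded input shape:  (1, 10, 10, 9, 4)
+#   normal output shape: (1, 10, 10, 4)
+#   folded output shape: (1, 10, 10, 2, 2)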
+
+
+class Vector_Vector_Activate_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function"""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "PE": ("i", True, 0),
+            "Dim": ("i", True, 0),
+            "Channels": ("i", True, 0),
+            "Kernel": ("i", True, 0),
+            "resType": ("s", True, ""),
+            "ActVal": ("i", False, 0),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # no-activation mode (produce accumulators)
+            "noActivation": ("i", False, 0),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "ram_style": ("s", False, "auto"),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def calc_wmem(self):
+        """Calculates and returns WMEM."""
+        ch = self.get_nodeattr("Channels")
+        k = self.get_nodeattr("Kernel")
+        pe = self.get_nodeattr("PE")
+        nf = ch // pe
+        sf = k * k
+        wmem = nf * sf
+        return wmem
+
+    def calc_tmem(self):
+        """Calculates and returns TMEM."""
+        if self.get_nodeattr("noActivation") == 1:
+            return 0
+        else:
+            ch = self.get_nodeattr("Channels")
+            pe = self.get_nodeattr("PE")
+            return ch // pe
+
+    def make_shape_compatible_op(self, model):
+        oshape = self.get_normal_output_shape()
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # check input datatype against property
+        idt_name = self.get_input_datatype().name
+        exp_idt_name = self.get_nodeattr("inputDataType")
+        assert exp_idt_name == idt_name, "Bad input DataType for VVAU node"
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
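+        # TODO implement consistency checks (stubbed out in this first draft)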
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_weight_datatype(self):
+        """Returns FINN DataType of weights."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self):
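+        # one stream word carries all Channels in parallel (SIMD is fixed to 1)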
+        i_bits = self.get_input_datatype().bitwidth()
+        in_width = i_bits * self.get_nodeattr("Channels")
+        return in_width
+
+    def get_outstream_width(self):
+        o_bits = self.get_output_datatype().bitwidth()
+        out_width = o_bits * self.get_nodeattr("PE")
+        return out_width
+
+    def get_folded_input_shape(self):
+        k = self.get_nodeattr("Kernel")
+        sf = k * k
+        dim = self.get_nodeattr("Dim")
+        ch = self.get_nodeattr("Channels")
+        folded_input_shape = tuple([1, dim, dim, sf, ch])
+        return folded_input_shape
+
+    def get_folded_output_shape(self):
+        ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        nf = ch // pe
+        dim = self.get_nodeattr("Dim")
+        folded_output_shape = tuple([1, dim, dim, nf, pe])
+        return folded_output_shape
+
+    def get_normal_input_shape(self):
+        dim = self.get_nodeattr("Dim")
+        ch = self.get_nodeattr("Channels")
+        k = self.get_nodeattr("Kernel")
+        normal_input_shape = tuple([1, dim, dim, k * k * ch])
+        return normal_input_shape
+
+    def get_normal_output_shape(self):
+        ch = self.get_nodeattr("Channels")
+        dim = self.get_nodeattr("Dim")
+        normal_output_shape = tuple([1, dim, dim, ch])
+        return normal_output_shape
+
+    def get_number_output_values(self):
+        nf = np.prod(self.get_folded_output_shape()[:-1])
+        return nf
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
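+        # for example (hypothetical datatypes): INT2 input, BIPOLAR weights
+        # and INT2 output yield TSrcI="Slice<ap_int<2>>",
+        # TWeightI="Recast<Binary>" and TDstI="Slice<ap_int<2>>"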
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType.BINARY
+        # out_is_binary = self.get_output_datatype() == DataType.BINARY
+        wt_is_binary = self.get_weight_datatype() == DataType.BINARY
+        if inp_is_binary or wt_is_binary:
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
+        inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
+        wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
+        # fill in TSrcI and TWeightI
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Recast<Binary>"
+        elif inp_is_bipolar and (not wt_is_bipolar):
+            ret["TSrcI"] = "Recast<Binary>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and (not wt_is_bipolar):
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Identity"
+
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
+
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        pe = self.get_nodeattr("PE")
+        ch = self.get_nodeattr("Channels")
+        k = self.get_nodeattr("Kernel")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            ch,
+            1,
+            k,
+            k,
+        ), """Weights matrix doesn't
+        have expected shape (channels, 1, kernel_size, kernel_size)"""
+        ret = orig_weight_matrix
+        if self.get_weight_datatype() == DataType.BIPOLAR:
+            # convert bipolar weights to binary for export
+            # (generate_params relies on this conversion)
+            ret = (ret + 1) / 2
+        # flatten the kernel window and distribute rows between PEs
+        ret = ret.reshape(ch, k * k)
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
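+        # e.g. with hypothetical Channels=4 and PE=2, PE 0 is assigned
+        # channels 0 and 2 and PE 1 channels 1 and 3 (round-robin)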
+        # reshape to (1, PE, WMEM, 1): dummy outer dim and dummy
+        # innermost (SIMD=1) dim around the per-PE weight words
+        ret = ret.reshape(1, pe, wmem, 1)
+        # no SIMD dim reversal needed, since the innermost (SIMD) dim is 1
+        return ret
+
+    def generate_params(self, model, path):
+        # weights
+        weights = model.get_initializer(self.onnx_node.input[1])
+        # convert weights into hlslib-compatible format
+        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+        export_wdt = self.get_weight_datatype()
+        # bipolar weights were converted to binary in
+        # get_hls_compatible_weight_tensor, so export them as binary
+        if self.get_weight_datatype() == DataType.BIPOLAR:
+            export_wdt = DataType.BINARY
+        code_gen_dir = path
+
+        """Saves weights into params.h"""
+        weight_hls_code = numpy_to_hls_code(
+            weight_tensor, export_wdt, "weights", True, True
+        )
+        # write weights into params.h
+        f_weights = open("{}/params.h".format(code_gen_dir), "w")
+
+        if export_wdt.bitwidth() != 1:
+            f_weights.write(
+                "const FixedPointWeights<1,{},{},{}> weights = ".format(
+                    export_wdt.get_hls_datatype_str(),
+                    self.get_nodeattr("PE"),
+                    self.calc_wmem(),
+                )
+            )
+        else:
+            f_weights.write(
+                "const BinaryWeights<1,{},{}> weights = ".format(
+                    self.get_nodeattr("PE"), self.calc_wmem(),
+                )
+            )
+        f_weights.write(weight_hls_code)
+        f_weights.close()
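+        # the resulting params.h then declares, e.g. for hypothetical INT2
+        # weights with PE=2 and WMEM=18:
+        # const FixedPointWeights<1,ap_int<2>,2,18> weights = ...;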
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create an npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data
+            # input, the second input is the weights and the third input
+            # holds the (optional) thresholds
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                if self.get_input_datatype() == DataType.BIPOLAR:
+                    # store bipolar activations as binary
+                    reshaped_input = (reshaped_input + 1) / 2
+                    export_idt = DataType.BINARY
+                else:
+                    export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception(
+                    "Unexpected input found for Vector_Vector_Activate_Unit"
+                )
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType.BIPOLAR:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_folded_output_shape()
+            ), """Output shape is not as expected"""
+            # reshape output to have expected shape
+            oshape = self.get_normal_output_shape()
+            context[node.output[0]] = context[node.output[0]].reshape(*oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+        # if self.calc_tmem() != 0:
+        #    # TODO find a better way of checking for no pregenerated thresholds
+        #    self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+
+    def defines(self, var):
+        dim = self.get_nodeattr("Dim")
+        numReps = 1 * dim * dim
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define Channels1 {}\n #define Kernel1 {}\n
+            #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format(
+                self.get_nodeattr("Channels"),
+                self.get_nodeattr("Kernel"),
+                self.get_nodeattr("PE"),
+                numReps,
+            )
+        ]
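+        # e.g. with hypothetical Channels=4, Kernel=3, PE=2 and Dim=10 this
+        # emits: #define Channels1 4, #define Kernel1 3, #define SIMD1 1,
+        # #define PE1 2 and #define numReps 100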
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is not reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        tmpl_args = self.get_template_param_values()
+        if self.calc_tmem() == 0:
+            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
+            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
+        else:
+            threshs = "threshs"
+        node = self.onnx_node
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<Channels1, Kernel1, SIMD1, PE1, 1, {}, {}, {}>
+            (in0, out, weights, {}, numReps, {});""".format(
+                node.op_type,
+                tmpl_args["TSrcI"],
+                tmpl_args["TDstI"],
+                tmpl_args["TWeightI"],
+                threshs,
+                self.get_nodeattr("resType"),
+            )
+        ]
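+        # the generated call then looks like (hypothetical template args,
+        # assuming resType is set to e.g. "ap_resource_lut()"):
+        # Vector_Vector_Activate_Batch<Channels1, Kernel1, SIMD1, PE1, 1,
+        #   Slice<ap_int<2>>, Slice<ap_int<2>>, Recast<Binary>>
+        #   (in0, out, weights, PassThroughActivation<ap_int<2>>(),
+        #    numReps, ap_resource_lut());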
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0,
+            hls::stream<ap_uint<{}>> &out
+            )""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.get_outstream_width(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        in_fifo_depth = self.get_nodeattr("inFIFODepth")
+        out_fifo_depth = self.get_nodeattr("outFIFODepth")
+        # insert depth pragmas only if specified
+        if in_fifo_depth != 0:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
+            )
+        if out_fifo_depth != 0:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
+            )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+        self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+        # the weight tensor is ap_uint<ch*prec> [PE][WMEM]
+        # partition for parallel access along the PE dimension (dim 1)
+        self.code_gen_dict["$PRAGMAS$"].append(
+            ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
+        )
+
+        ## the threshold tensor is acc_type [PE][TMEM][N_THRES]
+        ## partition for parallel access along PE and N_THRES
+        ## dimensions (dims 1 and 3)
+        # if self.calc_tmem() != 0:
+        #    # TODO find a better way of checking for no pregenerated thresholds
+        #    self.code_gen_dict["$PRAGMAS$"].append(
+        #        (
+        #            "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+        #            "complete dim=1"
+        #        )
+        #    )
+        #    self.code_gen_dict["$PRAGMAS$"].append(
+        #        (
+        #            "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+        #            "complete dim=3"
+        #        )
+        #    )
-- 
GitLab