diff --git a/src/finn/custom_op/fpgadataflow/sameresize_batch.py b/src/finn/custom_op/fpgadataflow/sameresize_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf279dcc889d3afaa4da96833067e36371e6fc01
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/sameresize_batch.py
@@ -0,0 +1,301 @@
+import os
+import numpy as np
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class SameResize_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib SameResize function.
+    Implements 'same' padding on a given input image."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "ImgDim": ("i", True, 0),
+            "KernelDim": ("i", True, 0),
+            "Stride": ("i", True, 0),
+            "NumChannels": ("i", True, 0),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # distribution of added values to achieve "same" padding
+            "PaddingStyle": ("i", True, 2),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self):
+        idim = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+
+        ishape = (1, idim, idim, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self):
+        idim = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        kdim = self.get_nodeattr("KernelDim")
+        stride = self.get_nodeattr("Stride")
+        assert idim % stride == 0, "Stride must divide input dimension."
+        # number of "same" windows over the input data
+        same_windows = idim // stride
+        odim = kdim + stride * (same_windows - 1)
+
+        oshape = (1, odim, odim, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self):
+        # even though there is no folding in the current hlslib op,
+        # insert a time multiplexing axis to remain compatible with the
+        # shapes produced by the rest of the dataflow pipeline
+        ret = list(self.get_normal_input_shape())
+        ret.insert(-1, 1)
+        return tuple(ret)
+
+    def get_folded_output_shape(self):
+        # even though there is no folding in the current hlslib op,
+        # insert a time multiplexing axis to remain compatible with the
+        # shapes produced by the rest of the dataflow pipeline
+        ret = list(self.get_normal_output_shape())
+        ret.insert(-1, 1)
+        return tuple(ret)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for SameResize."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output.
+        (Same as input datatype)"""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_instream_width(self):
+        ibits = self.get_input_datatype().bitwidth()
+        num_ch = self.get_nodeattr("NumChannels")
+
+        return ibits * num_ch
+
+    def get_outstream_width(self):
+        obits = self.get_output_datatype().bitwidth()
+        num_ch = self.get_nodeattr("NumChannels")
+
+        return obits * num_ch
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+
+    def defines(self, var):
+        numReps = 1
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define ImgDim1 {}\n #define KernelDim1 {}\n
+            #define Stride1 {}\n #define NumChannels1 {}\n
+            #define PaddingStyle1 {}\n #define numReps {}""".format(
+                self.get_nodeattr("ImgDim"),
+                self.get_nodeattr("KernelDim"),
+                self.get_nodeattr("Stride"),
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("PaddingStyle"),
+                numReps,
+            )
+        ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        in_t = self.get_input_datatype().get_hls_datatype_str()
+        node = self.onnx_node
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<ImgDim1, KernelDim1, Stride1, NumChannels1,
+            {}, PaddingStyle1> (in0, out, numReps);""".format(
+                node.op_type, in_t
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_hls_type, packed_hls_type)
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (1, ImgDim, ImgDim, NumChannels)."""
+        if self.get_input_datatype() == DataType.BIPOLAR:
+            # store bipolar activations as binary
+            inp = (inp + 1) / 2
+            export_idt = DataType.BINARY
+        else:
+            export_idt = self.get_input_datatype()
+
+        # no reshaping for input since assuming no folding on input
+        # make copy before saving array
+        inp = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), inp)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        # binary -> bipolar if needed
+        if self.get_output_datatype() == DataType.BIPOLAR:
+            out = context[node.output[0]]
+            out = 2 * out - 1
+            context[node.output[0]] = out
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+        (1, OutputDim, OutputDim, NumChannels)."""
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 0d62862c222b44d2e507a90a80bfcd4fa405d3fe..238829e03353d79fab7c51e7d1b9dca6e2a96a11 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -44,6 +44,7 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
+from finn.custom_op.fpgadataflow.sameresize_batch import SameResize_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
@@ -64,6 +65,7 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
+custom_op["SameResize_Batch"] = SameResize_Batch
 custom_op["Thresholding_Batch"] = Thresholding_Batch
 custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
diff --git a/tests/fpgadataflow/test_fpgadataflow_sameresize.py b/tests/fpgadataflow/test_fpgadataflow_sameresize.py
new file mode 100644
index
0000000000000000000000000000000000000000..5c4401e1632ad24e7af14729e148c2308762e161
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_sameresize.py
@@ -0,0 +1,195 @@
+import pytest
+import os
+import numpy as np
+
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import gen_finn_dt_tensor
+import finn.core.onnx_exec as oxe
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+
+from finn.util.basic import pynq_part_map
+
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 10
+
+
+def make_single_sameresize_modelwrapper(
+    idim, odim, kdim, stride, num_ch, idt, pad_style
+):
+    # Build a single-node ONNX model wrapping one SameResize_Batch op
+    # with the given image/kernel/stride/channel parameters and FINN
+    # input datatype idt; pad_style is forwarded as PaddingStyle.
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, idim, idim, num_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, odim, odim, num_ch]
+    )
+
+    SameResize_node = helper.make_node(
+        "SameResize_Batch",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        ImgDim=idim,
+        KernelDim=kdim,
+        Stride=stride,
+        NumChannels=num_ch,
+        inputDataType=str(idt.name),
+        PaddingStyle=pad_style,
+    )
+
+    graph = helper.make_graph(
+        nodes=[SameResize_node], name="sameresize_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="sameresize-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+
+    return model
+
+
+# image dimension
+@pytest.mark.parametrize("idim", [8, 16])
+# kernel dimension
+@pytest.mark.parametrize("kdim", [2, 3])
+# stride
+@pytest.mark.parametrize("stride", [1, 2])
+# number of channels
+@pytest.mark.parametrize("num_ch", [1, 2])
+# FINN input datatype
+@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT4])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_sameresize_cppsim(idim, kdim, stride, num_ch, idt):
+    # C++ simulation of SameResize_Batch against a numpy "same"-padding reference
+    pad_style = 2
+    assert idim % stride == 0, "Stride must divide input dimension."
+    # number of "same" windows over the input data
+    same_windows = idim // stride
+    odim = kdim + stride * (same_windows - 1)
+
+    # generate input data
+    x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
+    input_dict = {"inp": x}
+
+    model = make_single_sameresize_modelwrapper(
+        idim, odim, kdim, stride, num_ch, idt, pad_style
+    )
+    model = model.transform(InferShapes())
+    model = model.transform(SetExecMode("cppsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareCppSim())
+    model = model.transform(CompileCppSim())
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    expected_oshape = (1, odim, odim, num_ch)
+    assert y_produced.shape == expected_oshape
+
+    # calculate reference
+    # pad_style 2: with an odd total pad, the extra pixel goes on the
+    # top/left edge; otherwise split evenly between both edges
+    pad = odim - idim
+    if pad_style == 2:
+        if pad % 2 == 0:
+            pad_up = pad // 2
+            pad_left = pad // 2
+        else:
+            pad_up = pad // 2 + 1
+            pad_left = pad // 2 + 1
+    else:
+        pad_up = pad // 2
+        pad_left = pad // 2
+    pad_down = pad - pad_up
+    pad_right = pad - pad_left
+
+    # use numpy padding function as reference
+    if idt == DataType.BIPOLAR:
+        y_expected = np.pad(
+            x,
+            ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)),
+            "constant",
+            constant_values=-1,
+        )
+    else:
+        y_expected = np.pad(
+            x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant"
+        )
+
+    assert (y_produced == y_expected).all()
+
+
+# image dimension
+@pytest.mark.parametrize("idim", [8, 16])
+# kernel dimension
+@pytest.mark.parametrize("kdim", [2, 3])
+# stride
+@pytest.mark.parametrize("stride", [1, 2])
+# number of channels
+@pytest.mark.parametrize("num_ch", [1, 2])
+# FINN input datatype
+@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT4])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_sameresize_rtlsim(idim, kdim, stride, num_ch, idt):
+    # RTL simulation of SameResize_Batch against a numpy "same"-padding reference
+    pad_style = 2
+    assert idim % stride == 0, "Stride must divide input dimension."
+    # number of "same" windows over the input data
+    same_windows = idim // stride
+    odim = kdim + stride * (same_windows - 1)
+
+    # generate input data
+    x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
+    input_dict = {"inp": x}
+
+    model = make_single_sameresize_modelwrapper(
+        idim, odim, kdim, stride, num_ch, idt, pad_style
+    )
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+
+    expected_oshape = (1, odim, odim, num_ch)
+    assert y_produced.shape == expected_oshape
+
+    # calculate reference
+    # pad_style 2: with an odd total pad, the extra pixel goes on the
+    # top/left edge; otherwise split evenly between both edges
+    pad = odim - idim
+    if pad_style == 2:
+        if pad % 2 == 0:
+            pad_up = pad // 2
+            pad_left = pad // 2
+        else:
+            pad_up = pad // 2 + 1
+            pad_left = pad // 2 + 1
+    else:
+        pad_up = pad // 2
+        pad_left = pad // 2
+    pad_down = pad - pad_up
+    pad_right = pad - pad_left
+
+    # use numpy padding function as reference
+    if idt == DataType.BIPOLAR:
+        y_expected = np.pad(
+            x,
+            ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)),
+            "constant",
+            constant_values=-1,
+        )
+    else:
+        y_expected = np.pad(
+            x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant"
+        )
+
+    assert (y_produced == y_expected).all()