Skip to content
Snippets Groups Projects
Commit cac1ea3a authored by Felix Jentzsch's avatar Felix Jentzsch
Browse files

Initial prototype

parent 82739b52
No related branches found
No related tags found
No related merge requests found
// ==============================================================
// RTL generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and OpenCL
// Version: 2020.1
// Copyright (C) 1986-2020 Xilinx, Inc. All Rights Reserved.
//
// ===========================================================
`timescale 1 ns / 1 ps
// Sliding-window buffer: a generated chain of shift FIFOs whose taps are
// wired to data_out. The $...$ markers are template placeholders —
// presumably expanded by the generating Python code before synthesis
// (TODO confirm against the generator).
module window_buffer
#(
parameter IN_WIDTH = 1, //c*bit-width
parameter OUT_WIDTH = 1, //c*bit-width*MMV_out
parameter BUFFER_ELEM_TOTAL = 1
)
(
CLK,
data_in,
shift_enable,
data_out
);
input CLK;
input [IN_WIDTH-1:0] data_in;
input shift_enable;
output [OUT_WIDTH-1:0] data_out;
//Input REG to enable simultaneous R/W
reg [IN_WIDTH-1:0] reg_input;
//REG FIFOs
$GENERATE_REG_FIFOS$
//BRAM FIFOs
//todo: generate real BRAM shift buffers if these get too large
$GENERATE_BRAM_FIFOS$
//Fixed REG FIFO <-> output mapping
// (combinational assigns from FIFO taps to data_out slices)
$GENERATE_OUTPUT_MAPPING$
//main process
integer i;
// On every enabled clock edge the generated shift logic advances the FIFO
// chain by one element, and the freshly captured input register is refilled
// from data_in (non-blocking, so reads-before-writes within one cycle).
always @ (posedge CLK) begin
if (shift_enable) begin
//shift logic
$GENERATE_SHIFT_LOGIC$
//shift in new data
reg_input <= data_in;
end
end
endmodule //window_buffer
// AXI-Stream wrapper around window_buffer: accepts SIMD-packed input pixels
// on in0_V_V and emits k-wide windows on out_V_V, driven by generated
// per-cycle read/write schedules. $...$ markers are template placeholders
// filled in by the generating code.
module $TOP_MODULE_NAME$ (
ap_clk,
ap_rst_n,
in0_V_V_TDATA,
in0_V_V_TVALID,
in0_V_V_TREADY,
out_V_V_TDATA,
out_V_V_TVALID,
out_V_V_TREADY
);
//parameters
parameter BIT_WIDTH = $BIT_WIDTH$;
parameter SIMD = $SIMD$; //assuming SIMD=C for now
parameter MMV_IN = $MMV_IN$; //assuming MMV_IN=1 for now
parameter MMV_OUT = $MMV_OUT$; //assuming MMV_OUT=K for now
parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; //c*bit-width
parameter BUF_OUT_WIDTH = BUF_IN_WIDTH * MMV_OUT; //c*bit-width*MMV_out
parameter CYCLES_TOTAL = $CYCLES_TOTAL$;
parameter BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$;
//IO ports
input ap_clk;
input ap_rst_n;
input [BUF_IN_WIDTH-1:0] in0_V_V_TDATA;
input in0_V_V_TVALID;
output in0_V_V_TREADY;
output [BUF_OUT_WIDTH-1:0] out_V_V_TDATA;
output out_V_V_TVALID;
input out_V_V_TREADY;
//main buffer instantiation
wire [BUF_IN_WIDTH-1:0] window_buffer_in;
wire [BUF_OUT_WIDTH-1:0] window_buffer_out;
wire window_buffer_shift_enable;
window_buffer
#(
.IN_WIDTH(BUF_IN_WIDTH),
.OUT_WIDTH(BUF_OUT_WIDTH),
.BUFFER_ELEM_TOTAL(BUF_ELEM_TOTAL)
)
window_buffer_inst
(
.CLK(ap_clk),
.data_in(window_buffer_in),
.shift_enable(window_buffer_shift_enable),
.data_out(window_buffer_out)
);
//FSM state
// NOTE(review): S2 is declared but never entered/handled in either case
// statement below — dead state, candidate for removal.
reg [1:0] state;
parameter STATE_RESET = 0, STATE_OPERATE = 1, S2 = 2;
//main cycle counter (where either read/write/both happen, resets for each image)
integer cycle;
//read/write loop state
// (driven by the generated schedules below; both are functions of `cycle`)
wire read_state;
wire write_state;
//output registers
// NOTE(review): out_V_V_TVALID_reg is never reset while ap_rst_n is low —
// it powers up undefined; confirm the generated schedule masks this.
reg out_V_V_TVALID_reg;
//assign buffer control
//todo: if mmv_out < k: might not shift and/or write for multiple read_state cycles
assign window_buffer_shift_enable = (read_state && in0_V_V_TVALID) || write_state;
//assign I/O ports
assign window_buffer_in = in0_V_V_TDATA;
assign in0_V_V_TREADY = read_state; //accept data whenever read loop wants to read
assign out_V_V_TDATA = window_buffer_out; //out_V_V_TDATA_reg;
assign out_V_V_TVALID = out_V_V_TVALID_reg;
//read schedule
//todo: generate differently
$GENERATE_READ_SCHEDULE$
//write schedule
//todo: generate differently
$GENERATE_WRITE_SCHEDULE$
//read process (writing to buffer)
// Advances `cycle` whenever a scheduled read or write handshake completes;
// wraps back to STATE_RESET after CYCLES_TOTAL cycles (one full image).
always @ (posedge ap_clk) begin
if (ap_rst_n == 1'b0) begin
state <= STATE_RESET;
end else begin
case (state)
STATE_RESET: begin
state <= STATE_OPERATE;
cycle <= 0;
end
STATE_OPERATE: begin
if (read_state && in0_V_V_TVALID) begin
//read into buffer
//done in concurrent assignment
//count cycle (R)
cycle <= cycle+1;
if (cycle == CYCLES_TOTAL-1)
state <= STATE_RESET;
end else if (write_state && out_V_V_TREADY) begin
cycle <= cycle+1; //count cycle (or W)
if (cycle == CYCLES_TOTAL-1)
state <= STATE_RESET;
end
end
endcase
end
end
//write process (reading from buffer)
// Registers TVALID one cycle behind the scheduled write condition.
always @ (posedge ap_clk) begin
if (ap_rst_n == 1'b0) begin
end else begin
case (state)
STATE_RESET: begin
end
STATE_OPERATE: begin
if (write_state && out_V_V_TREADY) begin
//write from buffer
//todo: VALID seems to be deasserted 1 cycle too late?!
out_V_V_TVALID_reg <= 1'b1;
end else begin
out_V_V_TVALID_reg <= 1'b0;
end
end
endcase
end
end
endmodule //ConvolutionInputGenerator1D_0_ConvolutionInputGenerator1D_0
@@ -34,6 +34,9 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import (
    ConvolutionInputGenerator1D,
)
from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import (
    ConvolutionInputGenerator_rtl,
)
from finn.custom_op.fpgadataflow.downsampler import DownSampler
from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
@@ -67,6 +70,7 @@ custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
custom_op["StreamingFCLayer_Batch"] = StreamingFCLayer_Batch
custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl
custom_op["TLastMarker"] = TLastMarker
custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
custom_op["StreamingFIFO"] = StreamingFIFO
......
This diff is collapsed.
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import pytest
import numpy as np
from onnx import TensorProto, helper
import finn.core.onnx_exec as oxe
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.core.datatype import DataType
from finn.core.modelwrapper import ModelWrapper
from finn.custom_op.general.im2col import compute_conv_output_dim
from finn.custom_op.registry import getCustomOp
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.general import GiveUniqueNodeNames
from finn.util.basic import gen_finn_dt_tensor
def make_single_im2col_modelwrapper(
    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
):
    """Build a single-node Im2Col model, used as the golden reference.

    Note: ``simd`` is accepted only for signature parity with the sliding
    window builder; the Im2Col node itself does not use it. The output
    datatype always equals the input datatype.
    """
    k_h, k_w = k
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    ifm_dim_h, ifm_dim_w = ifm_dim
    ofm_dim_h, ofm_dim_w = ofm_dim

    # NHWC input / output value infos for the graph boundary
    in_shape = [1, ifm_dim_h, ifm_dim_w, ifm_ch]
    out_shape = [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, in_shape)
    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, out_shape)

    node = helper.make_node(
        "Im2Col",
        ["inp"],
        ["outp"],
        domain="finn.custom_op.general",
        stride=[stride_h, stride_w],
        kernel_size=[k_h, k_w],
        input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)),
        dilations=[dilation_h, dilation_w],
        pad_amount=[0, 0, 0, 0],
        pad_value=0,
    )
    graph = helper.make_graph(
        nodes=[node], name="im2col_graph", inputs=[inp], outputs=[outp]
    )

    wrapped = ModelWrapper(helper.make_model(graph, producer_name="im2col-model"))
    wrapped.set_tensor_datatype("inp", idt)
    wrapped.set_tensor_datatype("outp", idt)
    return wrapped
def make_single_slidingwindow_modelwrapper(
    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0
):
    """Build a single-node ConvolutionInputGenerator_rtl model under test.

    Parameters mirror make_single_im2col_modelwrapper; ``dw`` additionally
    selects the depthwise output ordering of the generator. The output
    datatype always equals the input datatype.
    """
    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    ofm_dim_h, ofm_dim_w = ofm_dim
    odt = idt

    # NHWC input / output value infos for the graph boundary
    inp = helper.make_tensor_value_info(
        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
    )
    outp = helper.make_tensor_value_info(
        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
    )

    SlidingWindow_node = helper.make_node(
        "ConvolutionInputGenerator_rtl",
        ["inp"],
        ["outp"],
        domain="finn.custom_op.fpgadataflow",
        backend="fpgadataflow",
        ConvKernelDim=[k_h, k_w],
        IFMChannels=ifm_ch,
        IFMDim=[ifm_dim_h, ifm_dim_w],
        OFMDim=[ofm_dim_h, ofm_dim_w],
        SIMD=simd,
        Stride=[stride_h, stride_w],
        Dilation=[dilation_h, dilation_w],
        inputDataType=idt.name,
        outputDataType=odt.name,
        depthwise=dw,
    )
    graph = helper.make_graph(
        nodes=[SlidingWindow_node],
        name="slidingwindow_graph",
        inputs=[inp],
        outputs=[outp],
    )

    model = ModelWrapper(helper.make_model(graph, producer_name="slidingwindow-model"))
    model.set_tensor_datatype("inp", idt)
    model.set_tensor_datatype("outp", odt)
    # NOTE: a previous debug leftover unconditionally set rtlsim_trace to a
    # hardcoded absolute path here, dumping a VCD on every invocation; removed.
    # Re-enable locally via getCustomOp(node).set_nodeattr("rtlsim_trace", path).
    return model
def prepare_inputs(input_tensor):
    """Wrap a tensor in the execution-context dict keyed by the model's input name."""
    ctx = dict()
    ctx["inp"] = input_tensor
    return ctx
# input datatype
@pytest.mark.parametrize("idt", [DataType["INT4"]])
# kernel size
@pytest.mark.parametrize("k", [[3, 3]])
# input dimension
@pytest.mark.parametrize("ifm_dim", [[6, 11]])
# input channels
@pytest.mark.parametrize("ifm_ch", [2])
# Stride
@pytest.mark.parametrize("stride", [[1, 2]])
# Dilation
@pytest.mark.parametrize("dilation", [[1, 2]])
# execution mode
@pytest.mark.parametrize("exec_mode", ["rtlsim"])
# input channel parallelism ("SIMD")
@pytest.mark.parametrize("simd", [2])
# depthwise
@pytest.mark.parametrize("dw", [0])
# Flip dimensions
@pytest.mark.parametrize("flip", [False])
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_slidingwindow_rtl(
    idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, flip
):
    """Compare the RTL sliding-window generator against the Im2Col golden model.

    Builds both single-node models from identical geometry, runs the same
    random input through each, and asserts elementwise equality (with a
    channel-interleaving reshape of the expected output when dw != 0).
    """
    if flip:
        # swap H/W on every geometry parameter to test the transposed case
        k = k[::-1]
        ifm_dim = ifm_dim[::-1]
        stride = stride[::-1]
        dilation = dilation[::-1]

    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation

    if simd > ifm_ch:
        pytest.skip("SIMD cannot be larger than number of input channels")

    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
    ofm_dim = [ofm_dim_h, ofm_dim_w]

    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
    model = make_single_slidingwindow_modelwrapper(
        k=k,
        ifm_ch=ifm_ch,
        ifm_dim=ifm_dim,
        ofm_dim=ofm_dim,
        simd=simd,
        stride=stride,
        dilation=dilation,
        idt=idt,
        dw=dw,
    )

    if exec_mode == "cppsim":
        model = model.transform(SetExecMode("cppsim"))
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")

    # prepare input data
    input_dict = prepare_inputs(x)
    # execute model under test
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]

    # golden reference via Im2Col
    golden = make_single_im2col_modelwrapper(
        k=k,
        ifm_ch=ifm_ch,
        ifm_dim=ifm_dim,
        ofm_dim=ofm_dim,
        simd=simd,
        stride=stride,
        dilation=dilation,
        idt=idt,
    )
    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]

    if dw == 0:
        assert (y_produced == y_expected).all()
    else:
        # depthwise mode interleaves channels: reorder the Im2Col output
        # from (..., k*k, ch) grouping to (..., ch, k*k) before comparing
        y_expected = y_expected.reshape(
            1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd
        )
        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
        y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
        assert (y_produced == y_expected).all()

    # TODO(prototype): re-enable the expected-cycle check once the RTL's
    # TVALID timing is fixed (see todo in the Verilog write process):
    # if exec_mode == "rtlsim":
    #     node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
    #     inst = getCustomOp(node)
    #     cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
    #     exp_cycles_dict = model.analysis(exp_cycles_per_layer)
    #     exp_cycles = exp_cycles_dict[node.name]
    #     assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
    #     assert exp_cycles != 0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment