Skip to content
Snippets Groups Projects
Unverified Commit e8fd28c6 authored by auphelia's avatar auphelia Committed by GitHub
Browse files

Merge pull request #699 from Xilinx/feature/fmpadding_dynamic_integration

RTL FMPadding and dynamic padding support
parents 56fecea7 5d65dd46
No related branches found
No related tags found
No related merge requests found
/******************************************************************************
* Copyright (C) 2022, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @brief AXI-Light adapter for trivial write enable interface.
* @author Thomas B. Preußer <tpreusse@amd.com>
*****************************************************************************/
// AXI4-Lite slave adapter that exposes a plain write-enable interface.
//
// Write path: the address (AW) and data (W) channels are captured
// independently into Addr/Data. Once both are held, BVALID is raised; when
// the master additionally signals BREADY, `we` is asserted together with
// `wa`/`wd` and both capture registers are released on the next clock edge.
// WSTRB is accepted as a port but never evaluated inside this module.
//
// Read path: every read request is acknowledged and answered with all-ones
// data and a zero (OK) response code.
//
// Parameters:
//   ADDR_BITS - width of the AXI-Lite address ports and of `wa`.
module axi2we #(
int unsigned ADDR_BITS
)(
//- Global Control ------------------
input logic ap_clk,
input logic ap_rst_n,
//- AXI Lite ------------------------
// Writing
input s_axilite_AWVALID,
output s_axilite_AWREADY,
input [ADDR_BITS-1:0] s_axilite_AWADDR,
input s_axilite_WVALID,
output s_axilite_WREADY,
input [31:0] s_axilite_WDATA,
input [ 3:0] s_axilite_WSTRB,
output s_axilite_BVALID,
input s_axilite_BREADY,
output [1:0] s_axilite_BRESP,
// Reading tied to all-ones
input s_axilite_ARVALID,
output s_axilite_ARREADY,
input [ADDR_BITS-1:0] s_axilite_ARADDR,
output s_axilite_RVALID,
input s_axilite_RREADY,
output [31:0] s_axilite_RDATA,
output [ 1:0] s_axilite_RRESP,
// Write Enable Interface
output logic we,
output logic [ADDR_BITS-1:0] wa,
output logic [ 31:0] wd
);
// Local clock and active-high reset derived from the ap_* control ports.
uwire clk = ap_clk;
uwire rst = !ap_rst_n;
// WABusy / WDBusy flag that an address / a data word has been captured and
// is waiting for its counterpart; Addr / Data hold the captured values.
logic WABusy = 0;
logic WDBusy = 0;
logic [ADDR_BITS-1:0] Addr = 'x;
logic [ 31:0] Data = 'x;
// The write is issued in the cycle in which both halves are present and the
// master accepts the write response.
assign we = WABusy && WDBusy && s_axilite_BREADY;
assign wa = Addr;
assign wd = Data;
// Release both capture registers on reset or once the write has been issued.
uwire clr_wr = rst || we;
always_ff @(posedge clk) begin
if(clr_wr) begin
WABusy <= 0;
Addr <= 'x;
WDBusy <= 0;
Data <= 'x;
end
else begin
// Each channel is only sampled while its capture register is free.
if(!WABusy) begin
WABusy <= s_axilite_AWVALID;
Addr <= s_axilite_AWADDR;
end
if(!WDBusy) begin
WDBusy <= s_axilite_WVALID;
Data <= s_axilite_WDATA;
end
end
end
// A channel is ready exactly while its capture register is free.
assign s_axilite_AWREADY = !WABusy;
assign s_axilite_WREADY = !WDBusy;
// The response becomes valid once both address and data have been captured.
assign s_axilite_BVALID = WABusy && WDBusy;
assign s_axilite_BRESP = '0; // OK
// Answer all reads with '1
// RValid marks an accepted read request whose response is still pending.
logic RValid = 0;
// The pending response is retired on reset or when the master accepts it.
uwire clr_rd = rst || (RValid && s_axilite_RREADY);
always_ff @(posedge clk) begin
if(clr_rd) RValid <= 0;
else if(!RValid) RValid <= s_axilite_ARVALID;
end
// New read requests are only accepted while no response is pending.
assign s_axilite_ARREADY = !RValid;
assign s_axilite_RVALID = RValid;
assign s_axilite_RDATA = '1;
assign s_axilite_RRESP = '0; // OK
endmodule : axi2we
/******************************************************************************
* Copyright (C) 2022, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @brief Feature map padding.
* @author Thomas B. Preußer <tpreusse@amd.com>
*****************************************************************************/
// Streaming feature-map padder with run-time configurable geometry.
//
// The module walks an output frame using three nested counters: SCount
// (stream words per pixel, NUM_CHANNELS/SIMD of them), XCount (0 .. XEnd)
// and YCount (0 .. YEnd). While XOn <= XCount < XOff and
// YOn <= YCount < YOff a word from the input stream is forwarded; at every
// other position an all-zero word is emitted without consuming input.
//
// XOn/XOff/XEnd/YOn/YOff/YEnd start out as the INIT_* parameters and can be
// overwritten through the we/wa/wd port at addresses 0, 4, 8, 12, 16 and 20.
// These configuration registers have no reset term.
//
// Parameters:
//   XCOUNTER_BITS, YCOUNTER_BITS - widths of the X / Y counters and bounds.
//   NUM_CHANNELS                 - channels per pixel; must be a SIMD multiple.
//   SIMD                         - channels carried per stream word.
//   ELEM_BITS                    - bits per channel element.
//   INIT_*                       - initial values of the six bounds.
//   STREAM_BITS (localparam)     - SIMD*ELEM_BITS rounded up to a multiple of 8.
module fmpadding #(
int unsigned XCOUNTER_BITS,
int unsigned YCOUNTER_BITS,
int unsigned NUM_CHANNELS,
int unsigned SIMD,
int unsigned ELEM_BITS,
int unsigned INIT_XON,
int unsigned INIT_XOFF,
int unsigned INIT_XEND,
int unsigned INIT_YON,
int unsigned INIT_YOFF,
int unsigned INIT_YEND,
localparam int unsigned STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8)
)(
//- Global Control ------------------
input logic ap_clk,
input logic ap_rst_n,
// Parameter Configuration ----------
input logic we,
input logic [ 4:0] wa,
input logic [31:0] wd,
//- AXI Stream - Input --------------
output logic s_axis_tready,
input logic s_axis_tvalid,
input logic [STREAM_BITS-1:0] s_axis_tdata,
//- AXI Stream - Output -------------
input logic m_axis_tready,
output logic m_axis_tvalid,
output logic [STREAM_BITS-1:0] m_axis_tdata
);
// Local clock and active-high reset derived from the ap_* control ports.
uwire clk = ap_clk;
uwire rst = !ap_rst_n;
//-----------------------------------------------------------------------
// Parameter Sanity Checking
// Counter-width violations are fatal ($finish); an initially empty output
// window only produces a warning.
initial begin
automatic bit fail = 0;
if(XCOUNTER_BITS < $clog2(1+INIT_XEND)) begin
$error("XCounter size too small to accommodate end count.");
fail = 1;
end
if(XCOUNTER_BITS < $clog2(1+INIT_XON)) begin
$error("XCounter size too small to accommodate ON count.");
fail = 1;
end
if(XCOUNTER_BITS < $clog2(1+INIT_XOFF)) begin
$error("XCounter size too small to accommodate OFF count.");
fail = 1;
end
if(YCOUNTER_BITS < $clog2(1+INIT_YEND)) begin
$error("YCounter size too small to accommodate end count.");
fail = 1;
end
if(YCOUNTER_BITS < $clog2(1+INIT_YON)) begin
$error("YCounter size too small to accommodate ON count.");
fail = 1;
end
if(YCOUNTER_BITS < $clog2(1+INIT_YOFF)) begin
$error("YCounter size too small to accommodate OFF count.");
fail = 1;
end
if((INIT_XEND < INIT_XON) || (INIT_XOFF <= INIT_XON)) begin
$warning("Initial empty X output range.");
end
if((INIT_YEND < INIT_YON) || (INIT_YOFF <= INIT_YON)) begin
$warning("Initial empty Y output range.");
end
if(fail) $finish();
end
//-----------------------------------------------------------------------
// Dynamically configurable state
typedef logic [XCOUNTER_BITS-1:0] xcount_t;
xcount_t XEnd = INIT_XEND;
xcount_t XOn = INIT_XON;
xcount_t XOff = INIT_XOFF;
typedef logic [YCOUNTER_BITS-1:0] ycount_t;
ycount_t YEnd = INIT_YEND;
ycount_t YOn = INIT_YON;
ycount_t YOff = INIT_YOFF;
// Register map (byte addresses on wa): 0 XOn, 4 XOff, 8 XEnd, 12 YOn,
// 16 YOff, 20 YEnd. The 32-bit wd is assigned to the narrower counter
// types. Any other address raises an error and stops the simulation.
always_ff @(posedge clk) begin
if(we) begin
unique case(wa)
0*4: XOn <= wd;
1*4: XOff <= wd;
2*4: XEnd <= wd;
3*4: YOn <= wd;
4*4: YOff <= wd;
5*4: YEnd <= wd;
default: assert(0) else begin
$error("Illegal write address.");
$stop;
end
endcase
end
end
//-----------------------------------------------------------------------
// Cascaded enables for the nested counters: SCount, XCount, YCount
// sen steps the SIMD-fold counter, xen the X counter, yen the Y counter.
uwire sen;
uwire xen;
uwire yen;
//- S-Counter: SIMD fold ------------
initial begin
if((NUM_CHANNELS < 1) || (NUM_CHANNELS % SIMD != 0)) begin
$error("Channel count must be SIMD multiple.");
$finish;
end
end
// Count SF-2, SF-3, ..., 1, 0, -1
// The counter's top bit is the wrap indicator: xen fires when a word is
// stepped while that bit is set.
localparam int unsigned SF = NUM_CHANNELS/SIMD;
typedef logic [$clog2(SF-1):0] scount_t;
scount_t SCount = SF-2;
assign xen = sen && SCount[$left(SCount)];
uwire sclr = rst || xen;
always_ff @(posedge clk) begin
if(sclr) SCount <= SF-2;
else if(sen) SCount <= SCount - 1;
end
//- X-Counter: image width ----------
// Counts 0 .. XEnd; yen fires when the last column of a row completes.
xcount_t XCount = 0;
assign yen = xen && (XCount == XEnd);
uwire xclr = rst || yen;
always_ff @(posedge clk) begin
if(xclr) XCount <= 0;
else if(xen) XCount <= XCount + 1;
end
// Column lies within the half-open forwarding window [XOn, XOff).
uwire xfwd = (XOn <= XCount) && (XCount < XOff);
//- Y-Counter: image height ---------
// Counts 0 .. YEnd and wraps when the last row completes.
ycount_t YCount = 0;
uwire yclr = rst || (yen && (YCount == YEnd));
always_ff @(posedge clk) begin
if(yclr) YCount <= 0;
else if(yen) YCount <= YCount + 1;
end
// Row lies within the half-open forwarding window [YOn, YOff).
uwire yfwd = (YOn <= YCount) && (YCount < YOff);
//-----------------------------------------------------------------------
// Input forwarding and edge padding
// A buffers an input word that cannot move on in the current cycle;
// B is the output register that drives the master stream.
typedef struct {
logic vld;
logic [STREAM_BITS-1:0] dat;
} buf_t;
buf_t A = '{ vld: 0, dat: 'x };
buf_t B = '{ vld: 0, dat: 'x };
// Current position takes its word from the input stream (otherwise zeros).
uwire fwd = xfwd && yfwd;
// Counters advance exactly when B is free to load and a word is available
// for it: either an input word (direct or buffered in A) or a padding word.
assign sen = (m_axis_tready || !B.vld) && (s_axis_tvalid || A.vld || !fwd);
// Input is accepted only while A is empty.
assign s_axis_tready = !A.vld;
assign m_axis_tvalid = B.vld;
assign m_axis_tdata = B.dat;
always_ff @(posedge clk) begin
if(rst) begin
B <= '{ vld: 0, dat: 'x };
end
else if(m_axis_tready || !B.vld) begin
// Load B when it is empty or being drained: zeros outside the window,
// otherwise the buffered word in A takes priority over the live input.
B.vld <= s_axis_tvalid || A.vld || !fwd;
B.dat <= !fwd? '0 : A.vld? A.dat : s_axis_tdata;
end
end
always_ff @(posedge clk) begin
if(rst) begin
A <= '{ vld: 0, dat: 'x };
end
else begin
// A pending or newly arrived input word stays in A while the output is
// stalled or while a padding word is being emitted instead.
A.vld <= (A.vld || s_axis_tvalid) && ((B.vld && !m_axis_tready) || !fwd);
if(!A.vld) A.dat <= s_axis_tdata;
end
end
endmodule : fmpadding
/******************************************************************************
* Copyright (C) 2022, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @brief Feature map padding.
* @author Thomas B. Preußer <tpreusse@amd.com>
*****************************************************************************/
// AXI-Lite configurable feature-map padder.
//
// Structural wrapper only: an axi2we adapter turns AXI-Lite writes into
// write-enable strobes (we/wa/wd), which drive the configuration port of
// the fmpadding core. The stream ports are passed straight through to the
// core; all parameters are forwarded unchanged.
module fmpadding_axi #(
	int unsigned  XCOUNTER_BITS,
	int unsigned  YCOUNTER_BITS,
	int unsigned  NUM_CHANNELS,
	int unsigned  SIMD,
	int unsigned  ELEM_BITS,
	int unsigned  INIT_XON,
	int unsigned  INIT_XOFF,
	int unsigned  INIT_XEND,
	int unsigned  INIT_YON,
	int unsigned  INIT_YOFF,
	int unsigned  INIT_YEND,

	// Stream width: SIMD*ELEM_BITS padded up to whole bytes
	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8)
)(
	//- Global Control ------------------
	input	logic  ap_clk,
	input	logic  ap_rst_n,

	//- AXI Lite ------------------------
	// Writing
	input	       s_axilite_AWVALID,
	output	       s_axilite_AWREADY,
	input	[4:0]  s_axilite_AWADDR,

	input	        s_axilite_WVALID,
	output	        s_axilite_WREADY,
	input	[31:0]  s_axilite_WDATA,
	input	[ 3:0]  s_axilite_WSTRB,

	output	       s_axilite_BVALID,
	input	       s_axilite_BREADY,
	output	[1:0]  s_axilite_BRESP,

	// Reading
	input	       s_axilite_ARVALID,
	output	       s_axilite_ARREADY,
	input	[4:0]  s_axilite_ARADDR,

	output	        s_axilite_RVALID,
	input	        s_axilite_RREADY,
	output	[31:0]  s_axilite_RDATA,
	output	[ 1:0]  s_axilite_RRESP,

	//- AXI Stream - Input --------------
	output	logic  s_axis_tready,
	input	logic  s_axis_tvalid,
	input	logic [STREAM_BITS-1:0]  s_axis_tdata,

	//- AXI Stream - Output -------------
	input	logic  m_axis_tready,
	output	logic  m_axis_tvalid,
	output	logic [STREAM_BITS-1:0]  m_axis_tdata
);

	// Configuration write strobe, address and data between adapter and core
	uwire         cfg_we;
	uwire [ 4:0]  cfg_wa;
	uwire [31:0]  cfg_wd;

	// AXI-Lite Adapter
	axi2we #(
		.ADDR_BITS(5)
	) axilight_adapter (
		.ap_clk           (ap_clk),
		.ap_rst_n         (ap_rst_n),

		.s_axilite_AWVALID(s_axilite_AWVALID),
		.s_axilite_AWREADY(s_axilite_AWREADY),
		.s_axilite_AWADDR (s_axilite_AWADDR),

		.s_axilite_WVALID (s_axilite_WVALID),
		.s_axilite_WREADY (s_axilite_WREADY),
		.s_axilite_WDATA  (s_axilite_WDATA),
		.s_axilite_WSTRB  (s_axilite_WSTRB),

		.s_axilite_BVALID (s_axilite_BVALID),
		.s_axilite_BREADY (s_axilite_BREADY),
		.s_axilite_BRESP  (s_axilite_BRESP),

		.s_axilite_ARVALID(s_axilite_ARVALID),
		.s_axilite_ARREADY(s_axilite_ARREADY),
		.s_axilite_ARADDR (s_axilite_ARADDR),

		.s_axilite_RVALID (s_axilite_RVALID),
		.s_axilite_RREADY (s_axilite_RREADY),
		.s_axilite_RDATA  (s_axilite_RDATA),
		.s_axilite_RRESP  (s_axilite_RRESP),

		.we(cfg_we),
		.wa(cfg_wa),
		.wd(cfg_wd)
	);

	// Actual Padding
	fmpadding #(
		.XCOUNTER_BITS(XCOUNTER_BITS),
		.YCOUNTER_BITS(YCOUNTER_BITS),
		.NUM_CHANNELS (NUM_CHANNELS),
		.SIMD         (SIMD),
		.ELEM_BITS    (ELEM_BITS),
		.INIT_XON     (INIT_XON),
		.INIT_XOFF    (INIT_XOFF),
		.INIT_XEND    (INIT_XEND),
		.INIT_YON     (INIT_YON),
		.INIT_YOFF    (INIT_YOFF),
		.INIT_YEND    (INIT_YEND)
	) padding (
		.ap_clk  (ap_clk),
		.ap_rst_n(ap_rst_n),

		.we(cfg_we),
		.wa(cfg_wa),
		.wd(cfg_wd),

		.s_axis_tready(s_axis_tready),
		.s_axis_tvalid(s_axis_tvalid),
		.s_axis_tdata (s_axis_tdata),

		.m_axis_tready(m_axis_tready),
		.m_axis_tvalid(m_axis_tvalid),
		.m_axis_tdata (m_axis_tdata)
	);

endmodule : fmpadding_axi
// Simulation testbench for fmpadding_axi.
//
// Sequence:
//  1. Program the six geometry registers over AXI-Lite.
//  2. Pulse reset so that the counters restart with the new geometry
//     (the configuration registers themselves are not cleared by reset).
//  3. Feed IMAGES unpadded images as an incrementing word sequence with
//     occasional gaps in tvalid.
//  4. Randomly throttle the output and print every received word, one
//     output row per text line.
//
// The register addresses used here are the byte addresses decoded by
// fmpadding (0, 4, 8, 12, 16, 20) on the 5-bit address bus of the DUT.
module fmpadding_axi_tb #(
	int unsigned  XCOUNTER_BITS = 8,
	int unsigned  YCOUNTER_BITS = 8,
	int unsigned  NUM_CHANNELS  = 4,
	int unsigned  SIMD          = 2,
	int unsigned  ELEM_BITS     = 4
)();
	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8);

	//- Global Control ------------------
	logic  clk = 0;
	always #5ns clk = !clk;
	// Initialised at declaration so that no x->0 transition occurs at time
	// zero, which the output logger's @(negedge rst) could otherwise catch.
	logic  rst = 0;

	// AXI-Light for Parameter Configuration
	logic         s_axilite_AWVALID;
	uwire         s_axilite_AWREADY;
	logic [4:0]   s_axilite_AWADDR;	// matches the 5-bit DUT address port
	logic         s_axilite_WVALID;
	uwire         s_axilite_WREADY;
	logic [31:0]  s_axilite_WDATA;

	//- AXI Stream - Input --------------
	uwire  s_axis_tready;
	logic  s_axis_tvalid;
	logic [STREAM_BITS-1:0]  s_axis_tdata;

	//- AXI Stream - Output -------------
	logic  m_axis_tready;
	uwire  m_axis_tvalid;
	uwire [STREAM_BITS-1:0]  m_axis_tdata;

	// DUT
	// All INIT_* values are zero; the real geometry is written at run time.
	fmpadding_axi #(
		.XCOUNTER_BITS(XCOUNTER_BITS),
		.YCOUNTER_BITS(YCOUNTER_BITS),
		.NUM_CHANNELS(NUM_CHANNELS),
		.SIMD(SIMD),
		.INIT_XON(0), .INIT_XOFF(0), .INIT_XEND(0),
		.INIT_YON(0), .INIT_YOFF(0), .INIT_YEND(0),
		.ELEM_BITS(ELEM_BITS)
	) dut (
		.ap_clk(clk), .ap_rst_n(!rst),

		.s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR,
		.s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1),
		.s_axilite_BVALID(), .s_axilite_BREADY('1), .s_axilite_BRESP(),
		.s_axilite_ARVALID('0), .s_axilite_ARREADY(), .s_axilite_ARADDR('x),
		.s_axilite_RVALID(), .s_axilite_RREADY('0), .s_axilite_RDATA(), .s_axilite_RRESP(),

		.s_axis_tready, .s_axis_tvalid, .s_axis_tdata,
		.m_axis_tready, .m_axis_tvalid, .m_axis_tdata
	);

	// Stimuli
	localparam int unsigned  IMAGES = 2;
	localparam int unsigned  XSIZE = 10;	// padded output width
	localparam int unsigned  YSIZE =  7;	// padded output height
	localparam int unsigned  PAD_LEFT   = 2;
	localparam int unsigned  PAD_RIGHT  = 3;
	localparam int unsigned  PAD_TOP    = 1;
	localparam int unsigned  PAD_BOTTOM = 2;

	// Perform one AXI-Lite write: address phase first, then data phase.
	// The write response is accepted implicitly as BREADY is tied high.
	//   wa - byte address of the target register
	//   wd - value to write
	task axi_write(input logic [4:0]  wa, input logic [31:0]  wd);
		s_axilite_AWVALID <= 1;
		s_axilite_AWADDR  <= wa;
		@(posedge clk iff s_axilite_AWREADY);
		s_axilite_AWVALID <= 0;
		s_axilite_AWADDR  <= 'x;

		s_axilite_WVALID <= 1;
		s_axilite_WDATA  <= wd;
		@(posedge clk iff s_axilite_WREADY);
		s_axilite_WVALID <= 0;
		s_axilite_WDATA  <= 'x;
	endtask : axi_write

	initial begin
		s_axilite_AWVALID = 0;
		s_axilite_AWADDR  = 'x;
		s_axilite_WVALID  = 0;
		s_axilite_WDATA   = 'x;

		s_axis_tvalid = 0;
		s_axis_tdata  = 'x;

		// Configure Parameters
		rst = 0;
		@(posedge clk);
		/* XOn  */	axi_write(0*4, PAD_LEFT);
		/* XOff */	axi_write(1*4, XSIZE - PAD_RIGHT);
		/* XEnd */	axi_write(2*4, XSIZE - 1);
		/* YOn  */	axi_write(3*4, PAD_TOP);
		/* YOff */	axi_write(4*4, YSIZE - PAD_BOTTOM);
		/* YEnd */	axi_write(5*4, YSIZE - 1);
		@(posedge clk);

		// Restart the counters under the freshly written configuration
		rst <= 1;
		@(posedge clk);
		rst <= 0;
		@(posedge clk);

		// Feed data input
		s_axis_tvalid <= 1;
		for(int unsigned  i = 0; i < IMAGES * (XSIZE-PAD_LEFT-PAD_RIGHT) * (YSIZE-PAD_TOP-PAD_BOTTOM) * (NUM_CHANNELS/SIMD); i++) begin
			s_axis_tdata <= i;
			@(posedge clk iff s_axis_tready);
			// Insert a one-cycle gap after roughly every fifth word
			if($urandom()%5 == 0) begin
				s_axis_tvalid <= 0;
				s_axis_tdata  <= 'x;
				@(posedge clk);
				s_axis_tvalid <= 1;
			end
		end
		s_axis_tvalid <= 0;
		s_axis_tdata  <= 'x;
	end

	// Output Throttler
	// Deasserts tready for a random number of cycles after each valid word.
	initial begin
		m_axis_tready = 0;
		@(posedge clk iff !rst);
		m_axis_tready <= 1;
		forever @(posedge clk iff m_axis_tvalid) begin
			m_axis_tready <= 0;
			repeat(4-$clog2(1+$urandom()%15)) @(posedge clk);
			m_axis_tready <= 1;
		end
	end

	// Output logger
	// Starts after the reset pulse, prints IMAGES frames and ends the run.
	initial begin
		@(negedge rst);
		repeat(IMAGES) begin
			for(int unsigned  y = 0; y < YSIZE; y++) begin
				for(int unsigned  x = 0; x < XSIZE; x++) begin
					automatic string  delim = " ";
					for(int unsigned  s = 0; s < NUM_CHANNELS/SIMD; s++) begin
						@(posedge clk iff m_axis_tvalid && m_axis_tready);
						$write("%s%02X", delim, m_axis_tdata);
						delim = ":";
					end
				end
				$display();
			end
			$display("----");
		end
		$finish;
	end

endmodule : fmpadding_axi_tb
/******************************************************************************
* Copyright (C) 2022, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
// Top-level wrapper template around fmpadding_axi.
// Every dollar-delimited token in this file is a placeholder that has to be
// substituted textually before the file is valid Verilog. The wrapper only
// renames the stream ports (in0_V_* / out_V_*) and passes everything else
// straight through to the instantiated fmpadding_axi.
module $TOP_MODULE_NAME$(
//- Global Control ------------------
(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
input ap_clk,
(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
input ap_rst_n,
//- AXI Lite ------------------------
// Writing
input s_axilite_AWVALID,
output s_axilite_AWREADY,
input [4:0] s_axilite_AWADDR,
input s_axilite_WVALID,
output s_axilite_WREADY,
input [31:0] s_axilite_WDATA,
input [ 3:0] s_axilite_WSTRB,
output s_axilite_BVALID,
input s_axilite_BREADY,
output [1:0] s_axilite_BRESP,
// Reading
input s_axilite_ARVALID,
output s_axilite_ARREADY,
input [4:0] s_axilite_ARADDR,
output s_axilite_RVALID,
input s_axilite_RREADY,
output [31:0] s_axilite_RDATA,
output [ 1:0] s_axilite_RRESP,
//- AXI Stream - Input --------------
output in0_V_TREADY,
input in0_V_TVALID,
input [$STREAM_BITS$-1:0] in0_V_TDATA,
//- AXI Stream - Output -------------
input out_V_TREADY,
output out_V_TVALID,
output [$STREAM_BITS$-1:0] out_V_TDATA
);
// Parameterized padding core; the instance is named after the top module
// with an _impl suffix.
fmpadding_axi #(
.XCOUNTER_BITS($XCOUNTER_BITS$),
.YCOUNTER_BITS($YCOUNTER_BITS$),
.NUM_CHANNELS($NUM_CHANNELS$),
.SIMD($SIMD$),
.ELEM_BITS($ELEM_BITS$),
.INIT_XON($INIT_XON$),
.INIT_XOFF($INIT_XOFF$),
.INIT_XEND($INIT_XEND$),
.INIT_YON($INIT_YON$),
.INIT_YOFF($INIT_YOFF$),
.INIT_YEND($INIT_YEND$)
)
$TOP_MODULE_NAME$_impl
(
.ap_clk(ap_clk),
.ap_rst_n(ap_rst_n),
.s_axilite_AWVALID(s_axilite_AWVALID),
.s_axilite_AWREADY(s_axilite_AWREADY),
.s_axilite_AWADDR(s_axilite_AWADDR),
.s_axilite_WVALID(s_axilite_WVALID),
.s_axilite_WREADY(s_axilite_WREADY),
.s_axilite_WDATA(s_axilite_WDATA),
.s_axilite_WSTRB(s_axilite_WSTRB),
.s_axilite_BVALID(s_axilite_BVALID),
.s_axilite_BREADY(s_axilite_BREADY),
.s_axilite_BRESP(s_axilite_BRESP),
.s_axilite_ARVALID(s_axilite_ARVALID),
.s_axilite_ARREADY(s_axilite_ARREADY),
.s_axilite_ARADDR(s_axilite_ARADDR),
.s_axilite_RVALID(s_axilite_RVALID),
.s_axilite_RREADY(s_axilite_RREADY),
.s_axilite_RDATA(s_axilite_RDATA),
.s_axilite_RRESP(s_axilite_RRESP),
// Stream ports: external in0_V / out_V names map onto s_axis / m_axis.
.s_axis_tready(in0_V_TREADY),
.s_axis_tvalid(in0_V_TVALID),
.s_axis_tdata(in0_V_TDATA),
.m_axis_tready(out_V_TREADY),
.m_axis_tvalid(out_V_TVALID),
.m_axis_tdata(out_V_TDATA)
);
endmodule
......@@ -43,6 +43,7 @@ from finn.custom_op.fpgadataflow.downsampler import DownSampler
from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise
from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl
from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
from finn.custom_op.fpgadataflow.iodma import IODMA
from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
......@@ -91,3 +92,4 @@ custom_op["Lookup"] = Lookup
custom_op["StreamingConcat"] = StreamingConcat
custom_op["CheckSum"] = CheckSum
custom_op["StreamingEltwise"] = StreamingEltwise
custom_op["FMPadding_rtl"] = FMPadding_rtl
# Copyright (C) 2022, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import numpy as np
import os
import shutil
import warnings
from qonnx.core.datatype import DataType
from qonnx.util.basic import roundup_to_integer_multiple
from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
try:
from pyverilator import PyVerilator
except ModuleNotFoundError:
PyVerilator = None
class FMPadding_rtl(HLSCustomOp):
"""CustomOp wrapper for the finn-rtllib fmpadding_axi component
Supports adjusting the padding amount and spatial feature sizes at
runtime."""
def __init__(self, onnx_node):
    """Create the custom-op wrapper for `onnx_node`.

    All initialization is delegated to the base class.
    """
    super().__init__(onnx_node)
def get_nodeattr_types(self):
    """Return the node attribute table of this op.

    The op-specific entries are declared first and then merged with the
    table of the base class; on a name clash the base-class entry wins.
    """
    attr_table = {}
    # spatial size of the input images, [H, W] = [Y, X]
    attr_table["ImgDim"] = ("ints", True, [])
    # total padding to apply per dimension,
    # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
    attr_table["Padding"] = ("ints", True, [1, 1, 1, 1])
    # number of channels in the input image
    attr_table["NumChannels"] = ("i", True, 0)
    # SIMD input parallelism
    attr_table["SIMD"] = ("i", False, 1)
    # FINN input datatype
    attr_table["inputDataType"] = ("s", True, "")
    # shape describing input vecs per execution
    attr_table["numInputVectors"] = ("i", False, 1)
    # enable the runtime-reconfigurable implementation (0 = off, 1 = on)
    attr_table["dynamic_mode"] = ("i", False, 0, {0, 1})
    # attribute to save top module name - not user configurable
    attr_table["gen_top_module"] = ("s", False, "")

    attr_table.update(super().get_nodeattr_types())
    return attr_table
def get_padded_odim(self):
    """Return the padded spatial output size as ``[height, width]``.

    Each dimension is the "ImgDim" extent plus the begin and end padding
    of that dimension taken from the "Padding" attribute.
    """
    in_h, in_w = self.get_nodeattr("ImgDim")
    padding = self.get_nodeattr("Padding")
    # Padding layout: [H_begin, W_begin, H_end, W_end]
    total_pad_h = padding[0] + padding[2]
    total_pad_w = padding[1] + padding[3]
    return [in_h + total_pad_h, in_w + total_pad_w]
def get_exp_cycles(self):
    """Return the expected cycle count as an int.

    Computed as (NumChannels / SIMD) * numInputVectors * padded height *
    padded width, truncated to an integer.
    """
    out_h, out_w = self.get_padded_odim()
    num_ch = self.get_nodeattr("NumChannels")
    pe = self.get_nodeattr("SIMD")
    n_vecs = self.get_nodeattr("numInputVectors")
    return int((num_ch / pe) * n_vecs * out_h * out_w)
def get_normal_input_shape(self, ind=0):
    """Return the unfolded input shape ``(1, H, W, NumChannels)``.

    `ind` is accepted but not used by this implementation.
    """
    height, width = self.get_nodeattr("ImgDim")
    channels = self.get_nodeattr("NumChannels")
    return (1, height, width, channels)
def get_normal_output_shape(self, ind=0):
    """Return the unfolded output shape ``(1, H_padded, W_padded, NumChannels)``.

    `ind` is accepted but not used by this implementation.
    """
    height, width = self.get_padded_odim()
    channels = self.get_nodeattr("NumChannels")
    return (1, height, width, channels)
def get_folded_input_shape(self, ind=0):
    """Return the input shape with the channel axis split into (fold, SIMD).

    Raises AssertionError if SIMD does not divide "NumChannels".
    `ind` is accepted but not used by this implementation.
    """
    unfolded = tuple(self.get_normal_input_shape())
    num_ch = self.get_nodeattr("NumChannels")
    pe = self.get_nodeattr("SIMD")
    assert num_ch % pe == 0, "SIMD must divide input channels"
    # last axis holds the channels; replace it by (fold, SIMD)
    n_folds = int(unfolded[-1] / pe)
    return unfolded[:-1] + (n_folds, pe)
def get_folded_output_shape(self, ind=0):
    """Return the output shape with the channel axis split into (fold, SIMD).

    Raises AssertionError if SIMD does not divide "NumChannels".
    `ind` is accepted but not used by this implementation.
    """
    unfolded = tuple(self.get_normal_output_shape())
    num_ch = self.get_nodeattr("NumChannels")
    pe = self.get_nodeattr("SIMD")
    assert num_ch % pe == 0, "SIMD must divide input channels"
    # last axis holds the channels; replace it by (fold, SIMD)
    n_folds = int(unfolded[-1] / pe)
    return unfolded[:-1] + (n_folds, pe)
def make_shape_compatible_op(self, model):
    """Return a constant-shape op carrying this node's normal output shape.

    Raises AssertionError if the shape of the node's first input tensor in
    `model` differs from the expected normal input shape.
    """
    expected_in = self.get_normal_input_shape()
    out_shape = self.get_normal_output_shape()
    actual_in = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
    assert actual_in == expected_in, "Unexpected input shape for FMPadding_rtl."
    return super().make_const_shape_op(out_shape)
def infer_node_datatype(self, model):
    """Adopt the datatype of the input tensor and propagate it to the output.

    A warning is issued when the tensor datatype found in `model` differs
    from the currently configured "inputDataType". The attribute is then
    set to the tensor's datatype name and the same datatype is annotated
    on the first output tensor.
    """
    onnx_node = self.onnx_node
    tensor_dt = model.get_tensor_datatype(onnx_node.input[0])
    if tensor_dt != self.get_input_datatype():
        message = "inputDataType changing for %s: %s -> %s " % (
            onnx_node.name,
            str(self.get_input_datatype()),
            str(tensor_dt),
        )
        warnings.warn(message)
    self.set_nodeattr("inputDataType", tensor_dt.name)
    model.set_tensor_datatype(onnx_node.output[0], tensor_dt)
def verify_node(self):
    """No verification is performed for this op; always returns None."""
def get_input_datatype(self, ind=0):
    """Returns FINN DataType of input.

    Raises AssertionError if the datatype cannot represent zero.
    `ind` is accepted but not used by this implementation.
    """
    dtype = DataType[self.get_nodeattr("inputDataType")]
    # padding inserts zeros, so the DataType must be able to
    # represent the value zero
    assert dtype.allowed(0), "FMPadding_rtl DataType must support zero"
    return dtype
def get_output_datatype(self, ind=0):
    """Returns FINN DataType of output, which equals the input datatype.

    `ind` is accepted but not used by this implementation.
    """
    out_dtype = self.get_input_datatype()
    return out_dtype
def get_instream_width(self, ind=0):
    """Return the input stream width: input element bitwidth times SIMD.

    `ind` is accepted but not used by this implementation.
    """
    elem_bits = self.get_input_datatype().bitwidth()
    return elem_bits * self.get_nodeattr("SIMD")
def get_outstream_width(self, ind=0):
    """Return the output stream width: output element bitwidth times SIMD.

    `ind` is accepted but not used by this implementation.
    """
    elem_bits = self.get_output_datatype().bitwidth()
    return elem_bits * self.get_nodeattr("SIMD")
def get_number_output_values(self):
    """Return the product of all folded output dimensions except the last."""
    leading_dims = self.get_folded_output_shape()[:-1]
    return np.prod(leading_dims)
def get_verilog_top_module_intf_names(self):
    """Return the top-module interface names.

    Starts from the base-class result and, when "dynamic_mode" is set,
    registers the AXI-Lite control interface "s_axilite" under the
    "axilite" key.
    """
    interfaces = super().get_verilog_top_module_intf_names()
    dynamic = self.get_nodeattr("dynamic_mode")
    if dynamic:
        interfaces["axilite"] = ["s_axilite"]
    return interfaces
def execute_node(self, context, graph):
    """Execute this node via RTL simulation and store the result in `context`.

    The input tensor is read from ``context[node.input[0]]``, folded, saved
    as ``input_0.npy`` in the ipgen code generation directory and streamed
    through the RTL simulation. The simulation output is written to
    ``output.npy``, loaded back, reshaped to the normal output shape and
    stored in ``context[node.output[0]]``.

    Raises:
        Exception: if "exec_mode" is "cppsim" (not supported by this op) or
            any value other than "rtlsim".
        AssertionError: if the input is not float32, or the input or
            output shape differs from the expected one.
    """
    exec_mode = self.get_nodeattr("exec_mode")
    onnx_node = self.onnx_node
    expected_in_shape = self.get_normal_input_shape()
    expected_out_shape = self.get_normal_output_shape()
    folded_in_shape = self.get_folded_input_shape()

    # only rtlsim can be executed; reject everything else up front
    if exec_mode == "cppsim":
        raise Exception(
            "cppsim not possible for FMPadding_rtl, please set exec_mode to rtlsim"
        )
    if exec_mode != "rtlsim":
        raise Exception(
            """Invalid value for attribute exec_mode! Is currently set to: {}
has to be set to one of the following value ("cppsim", "rtlsim")""".format(
                exec_mode
            )
        )
    code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")

    in_tensor = context[onnx_node.input[0]]
    assert str(in_tensor.dtype) == "float32", "Input datatype is not float32"
    assert (
        in_tensor.shape == expected_in_shape
    ), """Input shape doesn't
match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
    in_dtype = self.get_input_datatype()

    # fold the input and hand it to the simulation via an .npy file
    np.save(
        os.path.join(code_gen_dir, "input_0.npy"),
        in_tensor.reshape(folded_in_shape),
    )
    sim = self.get_rtlsim()
    in_stream_bits = self.get_instream_width()
    sim_input = npy_to_rtlsim_input(
        "{}/input_0.npy".format(code_gen_dir), in_dtype, in_stream_bits
    )
    super().reset_rtlsim(sim)
    super().toggle_clk(sim)
    sim_output = self.rtlsim(sim, sim_input)

    # unpack the simulation output into an .npy file
    out_dtype = in_dtype
    elem_bits = out_dtype.bitwidth()
    out_stream_bits = self.get_outstream_width()
    out_npy_path = "{}/output.npy".format(code_gen_dir)
    folded_out_shape = self.get_folded_output_shape()
    rtlsim_output_to_npy(
        sim_output,
        out_npy_path,
        out_dtype,
        folded_out_shape,
        out_stream_bits,
        elem_bits,
    )

    # load and reshape output
    result = np.load(out_npy_path)
    result = np.asarray([result], dtype=np.float32).reshape(*expected_out_shape)
    context[onnx_node.output[0]] = result

    assert (
        context[onnx_node.output[0]].shape == expected_out_shape
    ), """Output shape doesn't match expected shape
(1, OutputDim_H, OutputDim_W, NumChannels)."""
def get_template_values(self, ifm_dims, pads, chans, simd, idt):
    """Compute the substitution values for the HDL template.

    Args:
        ifm_dims: input feature map size as (Y, X).
        pads: padding as (top, left, bottom, right).
        chans: number of channels.
        simd: SIMD parallelism.
        idt: input datatype; only its ``bitwidth()`` is used.

    Returns:
        dict mapping template key names to their values: counter widths,
        channel/SIMD/element sizes, the top module name, the initial
        ON/OFF/END bounds for both axes and the stream width rounded up
        to a multiple of 8.
    """
    in_h, in_w = ifm_dims
    pad_top, pad_left, pad_bottom, pad_right = pads

    # counters are sized from the padded extent plus one
    ybits = int(math.ceil(math.log2(pad_top + in_h + pad_bottom + 1)))
    xbits = int(math.ceil(math.log2(pad_left + in_w + pad_right + 1)))
    top_module = self.get_verilog_top_module_name()
    raw_stream_bits = idt.bitwidth() * simd
    padded_stream_bits = int(roundup_to_integer_multiple(raw_stream_bits, 8))

    values = {}
    values["XCOUNTER_BITS"] = int(xbits)
    values["YCOUNTER_BITS"] = int(ybits)
    values["NUM_CHANNELS"] = int(chans)
    values["SIMD"] = int(simd)
    values["ELEM_BITS"] = idt.bitwidth()
    values["TOP_MODULE_NAME"] = top_module
    # forwarding window is [ON, OFF); END is the last counter value
    values["INIT_XON"] = int(pad_left)
    values["INIT_XOFF"] = int(pad_left + in_w)
    values["INIT_XEND"] = int(pad_left + in_w + pad_right - 1)
    values["INIT_YON"] = int(pad_top)
    values["INIT_YOFF"] = int(pad_top + in_h)
    values["INIT_YEND"] = int(pad_top + in_h + pad_bottom - 1)
    values["STREAM_BITS"] = int(padded_stream_bits)
    return values
def get_dynamic_config(self, ifm_dims=None, pads=None):
    """Return the register settings needed to re-configure the feature map
    dimensions and padding amounts at runtime.

    ifm_dims and pads fall back to the "ImgDim" and "Padding" node
    attributes when not given. The result maps each register name
    (XON, XOFF, XEND, YON, YOFF, YEND) to an (offset, value) tuple, with
    offsets spaced 4 apart in that order and values taken from the
    matching INIT_* entry of get_template_values().
    """
    if ifm_dims is None:
        ifm_dims = self.get_nodeattr("ImgDim")
    if pads is None:
        pads = self.get_nodeattr("Padding")
    template_values = self.get_template_values(
        ifm_dims,
        pads,
        self.get_nodeattr("NumChannels"),
        self.get_nodeattr("SIMD"),
        self.get_input_datatype(),
    )
    # register order determines the offset: index * 4
    register_order = ("XON", "XOFF", "XEND", "YON", "YOFF", "YEND")
    return {
        reg: (index * 4, template_values["INIT_" + reg])
        for index, reg in enumerate(register_order)
    }
def generate_hdl(self):
    """Instantiate the FMPadding Verilog wrapper from its template and place
    it, together with the SystemVerilog sources it relies on, into the
    node's code_gen_dir_ipgen directory.

    The template and sources are read from
    $FINN_ROOT/finn-rtllib/fmpadding/hdl. Afterwards the gen_top_module,
    ipgen_path and ip_path node attributes are updated.
    """
    hdl_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl"
    template_file = hdl_dir + "/fmpadding_template.v"
    substitutions = self.get_template_values(
        self.get_nodeattr("ImgDim"),
        self.get_nodeattr("Padding"),
        self.get_nodeattr("NumChannels"),
        self.get_nodeattr("SIMD"),
        self.get_input_datatype(),
    )
    # remember the top module name so it can still be referred to after this
    # node has been renamed (e.g. by GiveUniqueNodeNames(prefix) during
    # MakeZynqProject)
    self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
    out_dir = self.get_nodeattr("code_gen_dir_ipgen")

    # fill every $KEY$ placeholder of the template with its value
    with open(template_file, "r") as src:
        hdl_text = src.read()
    for placeholder, value in substitutions.items():
        hdl_text = hdl_text.replace("$" + placeholder + "$", str(value))

    wrapper_path = os.path.join(
        out_dir, self.get_verilog_top_module_name() + ".v"
    )
    with open(wrapper_path, "w") as dst:
        dst.write(hdl_text)

    # copy the static SystemVerilog sources next to the generated wrapper
    for sv_name in ("fmpadding_axi.sv", "fmpadding.sv", "axi2we.sv"):
        shutil.copy(hdl_dir + "/" + sv_name, out_dir)

    # set ipgen_path and ip_path so that the HLS-Synth transformation
    # and the stitch_ip transformation do not complain
    for attr_name in ("ipgen_path", "ip_path"):
        self.set_nodeattr(attr_name, out_dir)
def prepare_rtlsim(self):
    """Creates a Verilator emulation library for the RTL code generated
    for this node, sets the rtlsim_so attribute to its path and returns
    a PyVerilator wrapper around it.

    Raises ImportError if PyVerilator is not available."""
    # Modified to use generated (System-)Verilog instead of HLS output products
    if PyVerilator is None:
        raise ImportError("Installation of PyVerilator is required.")

    code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
    # Use the top module name saved by generate_hdl() for both the wrapper
    # file name and the Verilator top module: the generated wrapper keeps
    # that name even if the node has been renamed since code generation.
    top_module = self.get_nodeattr("gen_top_module")
    verilog_paths = [code_gen_dir]
    verilog_files = [
        "fmpadding_axi.sv",
        "fmpadding.sv",
        "axi2we.sv",
        top_module + ".v",
    ]

    # build the Verilator emu library
    sim = PyVerilator.build(
        verilog_files,
        build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
        verilog_path=verilog_paths,
        trace_depth=get_rtlsim_trace_depth(),
        top_module_name=top_module,
    )
    # save generated lib filename in attribute
    self.set_nodeattr("rtlsim_so", sim.lib._name)
    return sim
def code_generation_ipi(self):
    """Constructs and returns the TCL for node instantiation in Vivado IPI.

    The returned list holds one add_files command per HDL source in
    code_gen_dir_ipgen, followed by a create_bd_cell command that
    references the saved top module and names the cell after the node."""
    out_dir = self.get_nodeattr("code_gen_dir_ipgen")
    hdl_names = [
        "fmpadding_axi.sv",
        "fmpadding.sv",
        "axi2we.sv",
        self.get_nodeattr("gen_top_module") + ".v",
    ]
    tcl_cmds = [
        "add_files -norecurse %s" % (os.path.join(out_dir, hdl_name))
        for hdl_name in hdl_names
    ]
    tcl_cmds.append(
        "create_bd_cell -type module -reference %s %s"
        % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
    )
    return tcl_cmds
def code_generation_ipgen(self, model, fpgapart, clk):
    """Normally: Generates C++ code and tcl script for IP generation.
    Here: Generates (System-)Verilog code for IP generation.

    The model, fpgapart and clk arguments are not used by this
    implementation; all work is delegated to generate_hdl()."""
    self.generate_hdl()
    return None
def ipgen_singlenode_code(self):
    """Normally: Builds the bash script for IP generation.
    Here: intentionally a no-op."""
    return None
def code_generation_cppsim(self, model):
    """Normally: Generates C++ code for simulation (cppsim).
    Here: intentionally a no-op; the model argument is ignored."""
    return None
def compile_singlenode_code(self):
    """No-op: this implementation does nothing for this hook."""
    return None


def global_includes(self):
    """No-op: this implementation does nothing for this hook."""
    return None


def defines(self, var):
    """No-op: the var argument is ignored."""
    return None


def read_npy_data(self):
    """No-op: this implementation does nothing for this hook."""
    return None


def strm_decl(self):
    """No-op: this implementation does nothing for this hook."""
    return None


def docompute(self):
    """No-op: this implementation does nothing for this hook."""
    return None


def dataoutstrm(self):
    """No-op: this implementation does nothing for this hook."""
    return None


def save_as_npy(self):
    """No-op: this implementation does nothing for this hook."""
    return None


def blackboxfunction(self):
    """No-op: this implementation does nothing for this hook."""
    return None


def pragmas(self):
    """No-op: this implementation does nothing for this hook."""
    return None
......@@ -117,8 +117,12 @@ class InferConvInpGen(Transformation):
ConvInpGen_idim_h = odim_padding_h
ConvInpGen_idim_w = odim_padding_w
padding_optype = (
"FMPadding_rtl" if self.use_rtl_variant else "FMPadding_Batch"
)
padding_node = helper.make_node(
"FMPadding_Batch",
padding_optype,
[i2c_input],
[padding_out],
domain="finn.custom_op.fpgadataflow",
......
......@@ -41,7 +41,10 @@ from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
from qonnx.transformation.infer_datatypes import InferDataTypes
from qonnx.transformation.infer_shapes import InferShapes
from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from qonnx.transformation.lower_convs_to_matmul import (
LowerConvsToMatMul,
_auto_pad_to_explicit_padding,
)
from qonnx.util.basic import gen_finn_dt_tensor, get_by_name
import finn.core.onnx_exec as oxe
......@@ -54,25 +57,48 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
)
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.util.basic import pyverilate_get_liveness_threshold_cycles
def create_conv_model(idim, ifm, k, stride, ofm, idt, wdt):
def create_conv_model(
idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise
):
np.random.seed(0)
ishp = (1, ifm, idim, idim)
int_dim = compute_conv_output_dim(idim, k, stride)
odim = compute_conv_output_dim(int_dim, k, stride)
oshp = (1, ofm, odim, odim)
wshp = (ofm, ifm, k, k)
wshp_1 = (ofm, ofm, k, k)
group = ifm if depthwise else 1
group_str = str(group)
ishp = (1, ifm, idim_h, idim_w)
pad_0 = _auto_pad_to_explicit_padding(
pad_mode, idim_h, idim_w, k, k, stride, stride, 2
)
int_dim_h = compute_conv_output_dim(
idim_h, k, stride, total_pad=pad_0[0] + pad_0[2]
)
int_dim_w = compute_conv_output_dim(
idim_w, k, stride, total_pad=pad_0[1] + pad_0[3]
)
pad_1 = _auto_pad_to_explicit_padding(
pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2
)
odim_h = compute_conv_output_dim(
int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2]
)
odim_w = compute_conv_output_dim(
int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3]
)
oshp = (1, ifm, odim_h, odim_w) if depthwise else (1, ofm, odim_h, odim_w)
wshp = (ifm, 1, k, k) if depthwise else (ofm, ifm, k, k)
wshp_1 = (ifm, 1, k, k) if depthwise else (ofm, ofm, k, k)
ishp_str = str(list(ishp))
oshp_str = str(list(oshp))
wshp_str = str(list(wshp))
wshp_1_str = str(list(wshp_1))
kshp_str = str([k, k])
pad_str = str([0, 0, 0, 0])
pad_0_str = str(list(pad_0))
pad_1_str = str(list(pad_1))
stride_str = str([stride, stride])
dil_str = str([1, 1])
......@@ -88,11 +114,11 @@ def create_conv_model(idim, ifm, k, stride, ofm, idt, wdt):
>
{{
conv0 = Conv<
dilations={dil_str},group=1,kernel_shape={kshp_str},pads={pad_str},
dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_0_str},
strides={stride_str}
>(in0, param_c0_weight)
out0 = Conv<
dilations={dil_str},group=1,kernel_shape={kshp_str},pads={pad_str},
dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_1_str},
strides={stride_str}
>(conv0, param_c1_weight)
}}
......@@ -109,17 +135,19 @@ def create_conv_model(idim, ifm, k, stride, ofm, idt, wdt):
return model
def update_conv_model_dims(model, idim_new):
def update_conv_model_dims(model, idim_new_h, idim_new_w):
cnode = model.get_nodes_by_op_type("Conv")[0]
k, _ = get_by_name(cnode.attribute, "kernel_shape").ints
stride, _ = get_by_name(cnode.attribute, "strides").ints
ishp = model.get_tensor_shape("in0")
n, ci, _, _ = ishp
n, co, _, _ = model.get_tensor_shape("out0")
int_dim = compute_conv_output_dim(idim_new, k, stride)
odim = compute_conv_output_dim(int_dim, k, stride)
model.set_tensor_shape("in0", (n, ci, idim_new, idim_new))
model.set_tensor_shape("out0", (n, co, odim, odim))
int_dim_h = compute_conv_output_dim(idim_new_h, k, stride)
int_dim_w = compute_conv_output_dim(idim_new_w, k, stride)
odim_h = compute_conv_output_dim(int_dim_h, k, stride)
odim_w = compute_conv_output_dim(int_dim_w, k, stride)
model.set_tensor_shape("in0", (n, ci, idim_new_h, idim_new_w))
model.set_tensor_shape("out0", (n, co, odim_h, odim_w))
# remove all existing shapes
del model.graph.value_info[:]
model = model.transform(InferShapes())
......@@ -142,43 +170,87 @@ def config_hook(configs):
return None
def write_swg_config(sim):
reset_rtlsim(sim)
for axi_name, config in configs:
# 1. Write config registers to the SWG, dict defines (addr, value) tuples
# Write config registers to the SWG/FMPadding; the config dict
# defines (addr, value) tuples
for config_entry in config.values():
axilite_write(sim, config_entry[0], config_entry[1], basename=axi_name)
# 2. Set cfg_valid flag (>= 1 cycle)
axilite_write(sim, 0, 1, basename=axi_name)
# 3. Reset component (>= 1 cycle)
reset_rtlsim(sim)
return write_swg_config
cfg0 = {
"idims": [(32, 32), (8, 8)],
"ifm": 64,
"k": 3,
"stride": 1,
"ofm": 64,
"depthwise": True,
"pad_mode": "SAME_UPPER",
}
cfg1 = {
"idims": [(32, 16), (16, 8)],
"ifm": 4,
"k": 4,
"stride": 1,
"ofm": 8,
"depthwise": False,
"pad_mode": "SAME_UPPER",
}
cfg2 = {
"idims": [(64, 128), (2, 4)],
"ifm": 64,
"k": 3,
"stride": 1,
"ofm": 64,
"depthwise": True,
"pad_mode": "SAME_UPPER",
}
@pytest.mark.parametrize("cfg", [cfg0, cfg1, cfg2])
@pytest.mark.slow
@pytest.mark.vivado
@pytest.mark.fpgadataflow
def test_fpgadataflow_conv_dynamic():
idims = [32, 16]
ifm = 4
k = 4
stride = 1
ofm = 8
idt = DataType["UINT8"]
def test_fpgadataflow_conv_dynamic(cfg):
pad_mode = cfg["pad_mode"]
depthwise = cfg["depthwise"]
idims = cfg["idims"]
ifm = cfg["ifm"]
k = cfg["k"]
stride = cfg["stride"]
ofm = cfg["ofm"]
idt = DataType["UINT4"]
wdt = DataType["INT2"]
exp_cfgs = []
largest_model = None
for idim in idims:
ishp = (1, ifm, idim, idim)
idim_h, idim_w = idim
ishp = (1, ifm, idim_h, idim_w)
np.random.seed(0)
inp = gen_finn_dt_tensor(idt, ishp)
model = create_conv_model(idim, ifm, k, stride, ofm, idt, wdt)
_, _, int_dim, _ = model.get_tensor_shape("conv0")
_, _, odim, _ = model.get_tensor_shape("out0")
model = create_conv_model(
idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise
)
_, _, int_dim_h, int_dim_w = model.get_tensor_shape("conv0")
_, _, odim_h, odim_w = model.get_tensor_shape("out0")
pad0 = get_by_name(model.graph.node[0].attribute, "pads").ints
pad1 = get_by_name(model.graph.node[1].attribute, "pads").ints
if idim == max(idims):
# use largest model for hardware conversion
largest_model = copy.deepcopy(model)
golden = execute_onnx(model, {"in0": inp})["out0"]
exp_cfg = (idim, int_dim, odim, inp, golden)
exp_cfg = (
(idim_h, idim_w),
(int_dim_h, int_dim_w),
(odim_h, odim_w),
pad0,
pad1,
inp,
golden,
)
exp_cfgs.append(exp_cfg)
# convert to hardware and prepare simulation
......@@ -187,17 +259,34 @@ def test_fpgadataflow_conv_dynamic():
model = model.transform(
to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")
)
model = model.transform(to_hls.InferVectorVectorActivation())
model = model.transform(absorb.AbsorbConsecutiveTransposes())
parent_model = model.transform(CreateDataflowPartition())
sdp_inst = getCustomOp(
parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
)
model = ModelWrapper(sdp_inst.get_nodeattr("model"))
for swg_node in model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl"):
getCustomOp(swg_node).set_nodeattr("SIMD", 1)
assert len(model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")) == 2
if pad_mode == "VALID":
assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 0
else:
assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 2
dyn_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")
dyn_nodes += model.get_nodes_by_op_type("FMPadding_rtl")
for swg_node in dyn_nodes:
getCustomOp(swg_node).set_nodeattr("SIMD", 4)
getCustomOp(swg_node).set_nodeattr("dynamic_mode", 1)
getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16])
getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16])
comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation")
comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation")
for comp_node in comp_nodes:
if depthwise:
getCustomOp(comp_node).set_nodeattr("PE", 4)
else:
getCustomOp(comp_node).set_nodeattr("SIMD", 4)
getCustomOp(comp_node).set_nodeattr("PE", 4)
model = model.transform(InsertDWC())
model = model.transform(InsertFIFO())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
......@@ -208,31 +297,64 @@ def test_fpgadataflow_conv_dynamic():
# loop through experiment configurations
for exp_cfg in exp_cfgs:
idim, int_dim, odim, inp, golden = exp_cfg
(
(idim_h, idim_w),
(int_dim_h, int_dim_w),
(odim_h, odim_w),
pad0,
pad1,
inp,
golden,
) = exp_cfg
conv0_idim_h = idim_h + pad0[0] + pad0[2]
conv0_idim_w = idim_w + pad0[1] + pad0[3]
conv1_idim_h = int_dim_h + pad1[0] + pad1[2]
conv1_idim_w = int_dim_w + pad1[1] + pad1[3]
# get config for the new dimensions
swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")
swg0 = getCustomOp(swg_nodes[0])
update_tensor_dim(model, swg0.onnx_node.input[0], (idim, idim))
update_tensor_dim(model, swg0.onnx_node.output[0], (int_dim, int_dim))
config0 = swg0.get_dynamic_config((idim, idim))
update_tensor_dim(model, swg0.onnx_node.input[0], (conv0_idim_h, conv0_idim_w))
update_tensor_dim(model, swg0.onnx_node.output[0], (int_dim_h, int_dim_w))
swg_config0 = swg0.get_dynamic_config((conv0_idim_h, conv0_idim_w))
swg1 = getCustomOp(swg_nodes[1])
update_tensor_dim(model, swg1.onnx_node.input[0], (int_dim, int_dim))
update_tensor_dim(model, swg1.onnx_node.output[0], (odim, odim))
config1 = swg1.get_dynamic_config((int_dim, int_dim))
configs = [("s_axilite_0_", config0), ("s_axilite_1_", config1)]
update_tensor_dim(model, swg1.onnx_node.input[0], (conv1_idim_h, conv1_idim_w))
update_tensor_dim(model, swg1.onnx_node.output[0], (odim_h, odim_w))
swg_config1 = swg1.get_dynamic_config((conv1_idim_h, conv1_idim_w))
if pad_mode != "VALID":
pad_nodes = model.get_nodes_by_op_type("FMPadding_rtl")
padder0 = getCustomOp(pad_nodes[0])
update_tensor_dim(model, padder0.onnx_node.input[0], (idim_h, idim_w))
update_tensor_dim(
model, padder0.onnx_node.output[0], (conv0_idim_h, conv0_idim_w)
)
pad_config0 = padder0.get_dynamic_config((idim_h, idim_w), pad0)
padder1 = getCustomOp(pad_nodes[1])
update_tensor_dim(model, padder1.onnx_node.input[0], (int_dim_h, int_dim_w))
update_tensor_dim(
model, padder1.onnx_node.output[0], (conv1_idim_h, conv1_idim_w)
)
pad_config1 = padder1.get_dynamic_config((int_dim_h, int_dim_w), pad1)
configs = [
("s_axilite_0_", pad_config0),
("s_axilite_1_", swg_config0),
("s_axilite_2_", pad_config1),
("s_axilite_3_", swg_config1),
]
else:
configs = [("s_axilite_0_", swg_config0), ("s_axilite_1_", swg_config1)]
# adjust folded shapes for I/O FIFOs
# (since rtlsim_exec uses folded shape info to fold global i/o tensors)
first_node = getCustomOp(model.graph.node[0])
first_node_shp = list(first_node.get_folded_input_shape())
first_node_shp[1] = idim
first_node_shp[2] = idim
first_node_shp[1] = idim_h
first_node_shp[2] = idim_w
first_node.set_nodeattr("folded_shape", first_node_shp)
update_tensor_dim(model, first_node.onnx_node.input[0], (idim, idim))
update_tensor_dim(model, first_node.onnx_node.input[0], (idim_h, idim_w))
last_node = getCustomOp(model.graph.node[-1])
last_node_shp = list(last_node.get_folded_output_shape())
last_node_shp[1] = odim
last_node_shp[2] = odim
update_tensor_dim(model, last_node.onnx_node.output[0], (odim, odim))
last_node_shp[1] = odim_h
last_node_shp[2] = odim_w
update_tensor_dim(model, last_node.onnx_node.output[0], (odim_h, odim_w))
last_node.set_nodeattr("folded_shape", last_node_shp)
ctx = {"global_in": inp.transpose(0, 2, 3, 1)}
liveness_prev = pyverilate_get_liveness_threshold_cycles()
......
......@@ -53,7 +53,7 @@ test_fpga_part = pynq_part_map[test_pynq_board]
target_clk_ns = 10
def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt):
pad_h = padding[0] + padding[2]
pad_w = padding[1] + padding[3]
idim_h, idim_w = idim
......@@ -70,7 +70,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
)
FMPadding = helper.make_node(
"FMPadding_Batch",
optype,
["inp"],
["outp"],
domain="finn.custom_op.fpgadataflow",
......@@ -110,10 +110,14 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]])
# execution mode
@pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
# implementation style
@pytest.mark.parametrize("impl_style", ["rtl", "hls"])
@pytest.mark.fpgadataflow
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style):
if impl_style == "rtl" and mode == "cppsim":
pytest.skip("rtl implstyle has no cppsim, skipping")
if num_ch % simd != 0:
pytest.skip(" num_ch % simd != 0, skipping")
......@@ -127,7 +131,9 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
odim_h = idim_h + pad_h
odim_w = idim_w + pad_w
model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt)
optype = {"hls": "FMPadding_Batch", "rtl": "FMPadding_rtl"}[impl_style]
model = make_single_fmpadding_modelwrapper(optype, idim, pad, num_ch, simd, idt)
model = model.transform(InferShapes())
model = model.transform(SetExecMode(mode))
model = model.transform(GiveUniqueNodeNames())
......@@ -138,6 +144,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
model = model.transform(HLSSynthIP())
model = model.transform(PrepareRTLSim())
y_produced = oxe.execute_onnx(model, input_dict)["outp"]
expected_oshape = (1, odim_h, odim_w, num_ch)
assert y_produced.shape == expected_oshape
......@@ -149,7 +156,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
assert (y_produced == y_expected).all()
if mode == "rtlsim":
node = model.get_nodes_by_op_type("FMPadding_Batch")[0]
node = model.get_nodes_by_op_type(optype)[0]
inst = getCustomOp(node)
cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
exp_cycles_dict = model.analysis(exp_cycles_per_layer)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment