diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v new file mode 100644 index 0000000000000000000000000000000000000000..13bd2b6ead805a62b0ef13f6f94a9b28665f985f --- /dev/null +++ b/finn-rtllib/memstream/hdl/Q_srl.v @@ -0,0 +1,320 @@ +// original source: +// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v + + +// Copyright (c) 1999 The Regents of the University of California +// Copyright (c) 2010 The Regents of the University of Pennsylvania +// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London +// Copyright (c) 2020 Xilinx +// +// Permission to use, copy, modify, and distribute this software and +// its documentation for any purpose, without fee, and without a +// written agreement is hereby granted, provided that the above copyright +// notice and this paragraph and the following two paragraphs appear in +// all copies. +// +// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR +// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, +// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON +// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO +// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+//
+
+// Q_srl_oreg3_prefull_SIMPLE.v
+//
+//  - In-page queue with parameterizable depth, bit width
+//  - Stream I/O is triple (data, valid, back-pressure),
+//      with EOS concatenated into the data
+//  - Flow control for input & output is combinationally decoupled
+//  - 2 <= depth <= 256
+//      * (depth >= 2)  is required to decouple I/O flow control,
+//          where empty => no produce,  full => no consume,
+//          and depth 1 would ping-pong between the two at half rate
+//      * (depth <= 256) can be modified
+//           by changing ''synthesis loop_limit X'' below
+//          and changing ''addrwidth'' or its log computation
+//  - 1 <= width
+//  - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice,
+//      plus output register (for fast output)
+//  - Queue addressing is done by ''addr'' up-down counter
+//  - Queue fullness is checked by comparator (addr==depth-2) in state_more
+//  - Queue fullness                           is pre-computed for next cycle
+//  - Queue input back-pressure                is pre-computed for next cycle
+//  - Queue output valid (state!=state_empty)  is pre-computed for next cycle
+//      (necessary since SRL data output reg requires non-boolean state)
+//  - FSM has 3 states (empty, one, more)
+//  - When empty, continue to emit most recently emitted value (for debugging)
+//
+//  - Queue slots used      = / (state==state_empty) ? 0
+//                            | (state==state_one)   ? 1
+//                            \ (state==state_more)  ? addr+2
+//  - Queue slots used     <=   depth
+//  - Queue slots remaining =   depth - used
+//                          = / (state==state_empty) ? depth
+//                            | (state==state_one)   ? depth-1
+//                            \ (state==state_more)  ? depth-2-addr
+//
+//  - Synplify 7.1 / 8.0
+//  - Eylon Caspi,  9/11/03, 8/18/04, 3/29/05
+
+
+`ifdef  Q_srl
+`else
+`define Q_srl
+
+
+module Q_srl (clock, reset, i_d, i_v, i_b, o_d, o_v, o_b, count);
+
+   parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
+   parameter width = 16;   // - width of data (i_d, o_d)
+
+   `define LOG2 (  (((depth))     ==0) ? 0    /* - depth==0   LOG2=0 */ \
+                 : (((depth-1)>>0)==0) ? 0    /* - depth<=1   LOG2=0 */ \
+                 : (((depth-1)>>1)==0) ? 1    /* - depth<=2   LOG2=1 */ \
+                 : (((depth-1)>>2)==0) ? 2    /* - depth<=4   LOG2=2 */ \
+                 : (((depth-1)>>3)==0) ? 3    /* - depth<=8   LOG2=3 */ \
+                 : (((depth-1)>>4)==0) ? 4    /* - depth<=16  LOG2=4 */ \
+                 : (((depth-1)>>5)==0) ? 5    /* - depth<=32  LOG2=5 */ \
+                 : (((depth-1)>>6)==0) ? 6    /* - depth<=64  LOG2=6 */ \
+                 : (((depth-1)>>7)==0) ? 7    /* - depth<=128 LOG2=7 */ \
+                 : 8)                         /* - depth<=256 LOG2=8 */
+
+// parameter addrwidth = LOG2;                  // - width of queue addr
+
+   parameter addrwidth =
+                (  (((depth))     ==0) ? 0    // - depth==0   LOG2=0
+                 : (((depth-1)>>0)==0) ? 0    // - depth<=1   LOG2=0
+                 : (((depth-1)>>1)==0) ? 1    // - depth<=2   LOG2=1
+                 : (((depth-1)>>2)==0) ? 2    // - depth<=4   LOG2=2
+                 : (((depth-1)>>3)==0) ? 3    // - depth<=8   LOG2=3
+                 : (((depth-1)>>4)==0) ? 4    // - depth<=16  LOG2=4
+                 : (((depth-1)>>5)==0) ? 5    // - depth<=32  LOG2=5
+                 : (((depth-1)>>6)==0) ? 6    // - depth<=64  LOG2=6
+                 : (((depth-1)>>7)==0) ? 7    // - depth<=128 LOG2=7
+                 : 8)                         // - depth<=256 LOG2=8
+                ;
+
+   input     clock;
+   input     reset;
+
+   input  [width-1:0] i_d;      // - input  stream data (concat data + eos)
+   input              i_v;      // - input  stream valid
+   output             i_b;      // - input  stream back-pressure
+
+   output [width-1:0] o_d;      // - output stream data (concat data + eos)
+   output             o_v;      // - output stream valid
+   input              o_b;      // - output stream back-pressure
+
+   output [addrwidth:0] count;  // - output number of elems in queue
+
+   reg    [addrwidth-1:0] addr, addr_, a_;   // - SRL16 address
+                                             //     for data output
+   reg                    shift_en_;         // - SRL16 shift enable
+   reg    [width-1:0]     srl [depth-2:0];   // - SRL16 memory
+   reg                    shift_en_o_;       // - SRLO  shift enable
+   reg    [width-1:0]     srlo_, srlo        // - SRLO  output reg
+                          /* synthesis syn_allow_retiming=0 */ ;
+
+   parameter state_empty = 2'd0;    // - state empty : o_v=0 o_d=UNDEFINED
+   parameter state_one   = 2'd1;    // - state one   : o_v=1 o_d=srlo
+   parameter state_more  = 2'd2;    // - state more  : o_v=1 o_d=srlo
+                                    //     #items in queue (srlo + srl) = addr+2
+
+   reg [1:0] state, state_;         // - state register
+
+   wire      addr_full_;   // - true iff state_more && addr==depth-2 on NEXT cycle
+   reg       addr_full;    // - true iff state_more && addr==depth-2
+   wire      addr_zero_;   // - true iff addr==0
+   wire      o_v_reg_;     // - true iff !state_empty on NEXT cycle
+   reg       o_v_reg       // - true iff !state_empty
+             /* synthesis syn_allow_retiming=0 */ ;
+   wire      i_b_reg_;     // - true iff full on NEXT cycle
+   reg       i_b_reg       // - true iff full (reset value is 1)
+             /* synthesis syn_allow_retiming=0 */ ;
+
+   assign addr_full_ = (state_==state_more) && (addr_==depth-2);
+                                               // - queue full
+   assign addr_zero_ = (addr==0);              // - queue contains 2 (or 1,0)
+   assign o_v_reg_   = (state_!=state_empty);  // - output valid if non-empty
+   assign i_b_reg_   = addr_full_;             // - input bp if full
+   assign o_d = srlo;                          // - output data from queue
+   assign o_v = o_v_reg;                       // - output valid if non-empty
+   assign i_b = i_b_reg;                       // - input bp if full
+
+   assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0));
+
+   // - ''always'' block with both FFs and SRL16 does not work,
+   //      since FFs need reset but SRL16 does not
+
+   always @(posedge clock) begin               // - seq always: FFs
+      if (reset) begin
+         state     <= state_empty;
+         addr      <= 0;
+         addr_full <= 0;
+         o_v_reg   <= 0;
+         i_b_reg   <= 1;
+      end
+      else begin
+         state     <= state_;
+         addr      <= addr_;
+         addr_full <= addr_full_;
+         o_v_reg   <= o_v_reg_;
+         i_b_reg   <= i_b_reg_;
+      end
+   end // always @ (posedge clock)
+
+   always @(posedge clock) begin               // - seq always: srlo
+      // - infer enabled output reg at end of shift chain
+      // - input first element from i_d, all subsequent elements from SRL16
+      if (reset) begin
+         srlo <= 0;
+      end
+      else begin
+         if (shift_en_o_) begin
+            srlo <= srlo_;
+         end
+      end
+   end // always @ (posedge clock)
+
+   always @(posedge clock) begin               // - seq always: srl
+      // - infer enabled SRL16E from shifting srl array
+      // - no reset capability;  srl[] contents undefined on reset
+      if (shift_en_) begin
+         // synthesis loop_limit 256
+         for (a_=depth-2; a_>0; a_=a_-1) begin
+            srl[a_] <= srl[a_-1];
+         end
+         srl[0] <= i_d;
+      end
+   end // always @ (posedge clock)
+
+   always @* begin                             // - combi always
+      srlo_       <=  'bx;
+      shift_en_o_ <= 1'bx;
+      shift_en_   <= 1'bx;
+      addr_       <=  'bx;
+      state_      <= 2'bx;
+      case (state)
+
+         state_empty: begin                    // - (empty, will not produce)
+            if (i_v) begin                     // - empty & i_v => consume
+               srlo_       <= i_d;
+               shift_en_o_ <= 1;
+               shift_en_   <= 1'bx;
+               addr_       <= 0;
+               state_      <= state_one;
+            end
+            else begin                         // - empty & !i_v => idle
+               srlo_       <= 'bx;
+               shift_en_o_ <= 0;
+               shift_en_   <= 1'bx;
+               addr_       <= 0;
+               state_      <= state_empty;
+            end
+         end
+
+         state_one: begin                      // - (contains one)
+            if (i_v && o_b) begin              // - one & i_v & o_b => consume
+               srlo_       <= 'bx;
+               shift_en_o_ <= 0;
+               shift_en_   <= 1;
+               addr_       <= 0;
+               state_      <= state_more;
+            end
+            else if (i_v && !o_b) begin        // - one & i_v & !o_b => cons+prod
+               srlo_       <= i_d;
+               shift_en_o_ <= 1;
+               shift_en_   <= 1;
+               addr_       <= 0;
+               state_      <= state_one;
+            end
+            else if (!i_v && o_b) begin        // - one & !i_v & o_b => idle
+               srlo_       <= 'bx;
+               shift_en_o_ <= 0;
+               shift_en_   <= 1'bx;
+               addr_       <= 0;
+               state_      <= state_one;
+            end
+            else if (!i_v && !o_b) begin       // - one & !i_v & !o_b => produce
+               srlo_       <= 'bx;
+               shift_en_o_ <= 0;
+               shift_en_   <= 1'bx;
+               addr_       <= 0;
+               state_      <= state_empty;
+            end
+         end // case: state_one
+
+         state_more: begin                     // - (contains more than one)
+            if (addr_full || (depth==2)) begin
+                                               // - (full, will not consume)
+                                               // - (full here if depth==2)
+               if (o_b) begin                  // - full & o_b => idle
+                  srlo_       <= 'bx;
+                  shift_en_o_ <= 0;
+                  shift_en_   <= 0;
+                  addr_       <= addr;
+                  state_      <= state_more;
+               end
+               else begin                      // - full & !o_b => produce
+                  srlo_       <= srl[addr];
+                  shift_en_o_ <= 1;
+                  shift_en_   <= 0;
+//                addr_       <= addr-1;
+//                state_      <= state_more;
+                  addr_       <= addr_zero_ ? 0         : addr-1;
+                  state_      <= addr_zero_ ? state_one : state_more;
+               end
+            end
+            else begin                         // - (mid: neither empty nor full)
+               if (i_v && o_b) begin           // - mid & i_v & o_b => consume
+                  srlo_       <= 'bx;
+                  shift_en_o_ <= 0;
+                  shift_en_   <= 1;
+                  addr_       <= addr+1;
+                  state_      <= state_more;
+               end
+               else if (i_v && !o_b) begin     // - mid & i_v & !o_b => cons+prod
+                  srlo_       <= srl[addr];
+                  shift_en_o_ <= 1;
+                  shift_en_   <= 1;
+                  addr_       <= addr;
+                  state_      <= state_more;
+               end
+               else if (!i_v && o_b) begin     // - mid & !i_v & o_b => idle
+                  srlo_       <= 'bx;
+                  shift_en_o_ <= 0;
+                  shift_en_   <= 0;
+                  addr_       <= addr;
+                  state_      <= state_more;
+               end
+               else if (!i_v && !o_b) begin    // - mid & !i_v & !o_b => produce
+                  srlo_       <= srl[addr];
+                  shift_en_o_ <= 1;
+                  shift_en_   <= 0;
+                  addr_       <= addr_zero_ ? 0         : addr-1;
+                  state_      <= addr_zero_ ? state_one : state_more;
+               end
+            end // else: !if(addr_full)
+         end // case: state_more
+
+         default: begin
+            srlo_       <=  'bx;
+            shift_en_o_ <= 1'bx;
+            shift_en_   <= 1'bx;
+            addr_       <=  'bx;
+            state_      <= 2'bx;
+         end // case: default
+
+      endcase // case(state)
+   end // always @ *
+
+endmodule // Q_srl
+
+
+`endif  // `ifdef  Q_srl
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 4327b2477309eea73851c2263459017cf11f9f70..e58d5fe24f882649dc5b4bcd09dbd39470c7f3d4 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -134,10 +134,14 @@ wire [31:0] config_q0;
 
 //multiple wire AXI Streams
 reg m_axis_0_afull = 0;
-reg m_axis_0_tready;
+wire m_axis_0_tready_inv;
 wire m_axis_0_tvalid;
 wire $WEIGHT_RANGE$ m_axis_0_tdata;
 
+wire m_axis_0_tready_q;
+wire m_axis_0_tvalid_q;
+wire $WEIGHT_RANGE$ m_axis_0_tdata_q;
+
 reg m_axis_1_afull = 0;
 reg m_axis_1_tready = 1;
 wire m_axis_1_tvalid;
@@ -213,7 +217,7 @@ mem
 
 //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
 .m_axis_0_afull(m_axis_0_afull),
-.m_axis_0_tready(m_axis_0_tready),
+.m_axis_0_tready(!m_axis_0_tready_inv),
 .m_axis_0_tvalid(m_axis_0_tvalid),
.m_axis_0_tdata(m_axis_0_tdata), @@ -245,6 +249,23 @@ mem ); +// weight streamer FIFO +Q_srl #( +.depth(8), +.width($WEIGHT_WIDTH$) +) +$LAYER_NAME$_w_fifo +( + .clock(ap_clk), + .reset(!ap_rst_n), + .i_d(m_axis_0_tdata), + .i_v(m_axis_0_tvalid), + .i_b(m_axis_0_tready_inv), + .o_d(m_axis_0_tdata_q), + .o_v(m_axis_0_tvalid_q), + .o_b(!m_axis_0_tready_q) +); + //MVA_Stream_Unit $LAYER_NAME$ @@ -255,9 +276,9 @@ MVA_Stream_U .in0_V_V_TDATA(in0_V_V_TDATA), //$IN_RANGE$ input .in0_V_V_TVALID(in0_V_V_TVALID), //input .in0_V_V_TREADY(in0_V_V_TREADY), //output -.weights_V_V_TDATA(m_axis_0_tdata), //$WEIGHT_RANGE$ input -.weights_V_V_TVALID(m_axis_0_tvalid), //input -.weights_V_V_TREADY(m_axis_0_tready), //output +.weights_V_V_TDATA(m_axis_0_tdata_q), //$WEIGHT_RANGE$ input +.weights_V_V_TVALID(m_axis_0_tvalid_q), //input +.weights_V_V_TREADY(m_axis_0_tready_q), //output .out_V_V_TDATA(out_V_V_TDATA), //$OUT_RANGE$ output .out_V_V_TVALID(out_V_V_TVALID), //output .out_V_V_TREADY(out_V_V_TREADY) //input