diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
new file mode 100644
index 0000000000000000000000000000000000000000..13bd2b6ead805a62b0ef13f6f94a9b28665f985f
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -0,0 +1,320 @@
+// original source:
+// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v
+
+
+// Copyright (c) 1999 The Regents of the University of California
+// Copyright (c) 2010 The Regents of the University of Pennsylvania
+// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London
+// Copyright (c) 2020 Xilinx
+//
+// Permission to use, copy, modify, and distribute this software and
+// its documentation for any purpose, without fee, and without a
+// written agreement is hereby granted, provided that the above copyright
+// notice and this paragraph and the following two paragraphs appear in
+// all copies.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
+// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
+// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON
+// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
+// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+//
+
+// Q_srl_oreg3_prefull_SIMPLE.v
+//
+//  - In-page queue with parameterizable depth, bit width
+//  - Stream I/O is triple (data, valid, back-pressure),
+//      with EOS concatenated into the data
+//  - Flow control for input & output is combinationally decoupled
+//  - 2 <= depth <= 256
+//      * (depth >= 2)  is required to decouple I/O flow control,
+//          where empty => no produce,  full => no consume,
+//          and depth 1 would ping-pong between the two at half rate
+//      * (depth <= 256) can be modified
+//           by changing ''synthesis loop_limit X'' below
+//          and changing ''addrwidth'' or its log computation
+//  - 1 <= width
+//  - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice,
+//      plus output register (for fast output)
+//  - Queue addressing is done by ''addr'' up-down counter
+//  - Queue fullness is checked by comparator (addr==depth-2)
+//  - Queue fullness                           is pre-computed for next cycle
+//  - Queue input back-pressure                is pre-computed for next cycle
+//  - Queue output valid (state!=state_empty)  is pre-computed for next cycle
+//      (necessary since SRL data output reg requires non-boolean state)
+//  - FSM has 3 states (empty, one, more)
+//  - When empty, continue to emit most recently emitted value (for debugging)
+//
+//  - Queue slots used      = / (state==state_empty) ? 0
+//                            | (state==state_one)   ? 1
+//                            \ (state==state_more)  ? addr+2
+//  - Queue slots used     <=  depth
+//  - Queue slots remaining =  depth - used
+//                          = / (state==state_empty) ? depth
+//                            | (state==state_one)   ? depth-1
+//                            \ (state==state_more)  ? depth-2-addr
+//
+//  - Synplify 7.1 / 8.0
+//  - Eylon Caspi,  9/11/03, 8/18/04, 3/29/05
+
+
+`ifdef  Q_srl
+`else
+`define Q_srl
+
+
+module Q_srl (clock, reset, i_d, i_v, i_b, o_d, o_v, o_b, count);
+   // - FIFO queue: shift-register array ''srl'' feeding output register ''srlo''
+   parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
+   parameter width = 16;   // - width of data (i_d, o_d)
+   // - LOG2 macro is not referenced in this module; addrwidth spells it out
+   `define LOG2 (  (((depth))     ==0) ? 0	/* - depth==0   LOG2=0 */ \
+		 : (((depth-1)>>0)==0) ? 0	/* - depth<=1   LOG2=0 */ \
+		 : (((depth-1)>>1)==0) ? 1	/* - depth<=2   LOG2=1 */ \
+		 : (((depth-1)>>2)==0) ? 2	/* - depth<=4   LOG2=2 */ \
+		 : (((depth-1)>>3)==0) ? 3	/* - depth<=8   LOG2=3 */ \
+		 : (((depth-1)>>4)==0) ? 4	/* - depth<=16  LOG2=4 */ \
+		 : (((depth-1)>>5)==0) ? 5	/* - depth<=32  LOG2=5 */ \
+		 : (((depth-1)>>6)==0) ? 6	/* - depth<=64  LOG2=6 */ \
+		 : (((depth-1)>>7)==0) ? 7	/* - depth<=128 LOG2=7 */ \
+		 :                       8)	/* - depth<=256 LOG2=8 */
+
+// parameter addrwidth = LOG2;			// - width of queue addr
+   // - addrwidth = ceil(log2(depth)), capped at 8 (i.e. depth <= 256)
+   parameter addrwidth =
+		(  (((depth))     ==0) ? 0	// - depth==0   LOG2=0
+		 : (((depth-1)>>0)==0) ? 0	// - depth<=1   LOG2=0
+		 : (((depth-1)>>1)==0) ? 1	// - depth<=2   LOG2=1
+		 : (((depth-1)>>2)==0) ? 2	// - depth<=4   LOG2=2
+		 : (((depth-1)>>3)==0) ? 3	// - depth<=8   LOG2=3
+		 : (((depth-1)>>4)==0) ? 4	// - depth<=16  LOG2=4
+		 : (((depth-1)>>5)==0) ? 5	// - depth<=32  LOG2=5
+		 : (((depth-1)>>6)==0) ? 6	// - depth<=64  LOG2=6
+		 : (((depth-1)>>7)==0) ? 7	// - depth<=128 LOG2=7
+		 :                       8)	// - depth<=256 LOG2=8
+		 ;
+
+   input     clock;		// - all registers update on the rising edge
+   input     reset;		// - synchronous, active high
+
+   input  [width-1:0] i_d;	// - input  stream data (concat data + eos)
+   input              i_v;	// - input  stream valid
+   output             i_b;	// - input  stream back-pressure
+
+   output [width-1:0] o_d;	// - output stream data (concat data + eos)
+   output             o_v;	// - output stream valid
+   input              o_b;	// - output stream back-pressure
+
+   output [addrwidth:0] count;  // - output number of elems in queue
+
+   reg    [addrwidth-1:0] addr, addr_, a_;		// - SRL16 address
+							//     for data output
+   reg 			  shift_en_;			// - SRL16 shift enable
+   reg    [width-1:0] 	  srl [depth-2:0];		// - SRL16 memory
+   reg 			  shift_en_o_;			// - SRLO  shift enable
+   reg    [width-1:0] 	  srlo_, srlo			// - SRLO  output reg
+			  /* synthesis syn_allow_retiming=0 */ ;
+
+   parameter state_empty = 2'd0;    // - state empty : o_v=0 o_d=UNDEFINED
+   parameter state_one   = 2'd1;    // - state one   : o_v=1 o_d=srlo
+   parameter state_more  = 2'd2;    // - state more  : o_v=1 o_d=srlo
+				    //     #items in queue = addr+2
+
+   reg [1:0] state, state_;	    // - state register
+   // - names with a trailing ''_'' are next-cycle values feeding the FFs
+   wire      addr_full_;	    // - true iff state_more && addr==depth-2 on NEXT cycle
+   reg       addr_full; 	    // - registered addr_full_ (queue full)
+   wire      addr_zero_;	    // - true iff addr==0
+   wire      o_v_reg_;		    // - true iff !state_empty  on NEXT cycle
+   reg       o_v_reg  		    // - true iff !state_empty
+	     /* synthesis syn_allow_retiming=0 */ ;
+   wire      i_b_reg_;		    // - true iff full          on NEXT cycle
+   reg       i_b_reg  		    // - true iff full (set to 1 by reset)
+	     /* synthesis syn_allow_retiming=0 */ ;
+
+   assign addr_full_ = (state_==state_more) && (addr_==depth-2);
+						// - queue full
+   assign addr_zero_ = (addr==0);		// - queue contains 2 (or 1,0)
+   assign o_v_reg_   = (state_!=state_empty);	// - output valid if non-empty
+   assign i_b_reg_   = addr_full_;		// - input bp if full
+   assign o_d = srlo;				// - output data from queue
+   assign o_v = o_v_reg;			// - output valid if non-empty
+   assign i_b = i_b_reg;			// - input bp if full
+   // - occupancy: 0 when empty, 1 in state_one, addr+2 in state_more
+   assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0));
+
+   // - ''always'' block with both FFs and SRL16 does not work,
+   //      since FFs need reset but SRL16 does not
+
+   always @(posedge clock) begin	// - seq always: FFs
+      if (reset) begin
+	 state     <= state_empty;
+	 addr      <= 0;
+         addr_full <= 0;
+	 o_v_reg   <= 0;
+	 i_b_reg   <= 1;		// - back-pressure input while in reset
+      end
+      else begin
+	 state     <= state_;
+	 addr      <= addr_;
+         addr_full <= addr_full_;
+	 o_v_reg   <= o_v_reg_;
+	 i_b_reg   <= i_b_reg_;
+      end
+   end // always @ (posedge clock)
+
+   always @(posedge clock) begin	// - seq always: srlo
+      // - infer enabled output reg at end of shift chain
+      // - input first element from i_d, all subsequent elements from SRL16
+      if (reset) begin
+	 srlo <= 0;
+      end
+      else begin
+	 if (shift_en_o_) begin
+	    srlo <= srlo_;
+	 end
+      end
+   end // always @ (posedge clock)
+
+   always @(posedge clock) begin			// - seq always: srl
+      // - infer enabled SRL16E from shifting srl array
+      // - no reset capability;  srl[] contents undefined on reset
+      if (shift_en_) begin
+	 // synthesis loop_limit 256
+	 for (a_=depth-2; a_>0; a_=a_-1) begin
+	    srl[a_] <= srl[a_-1];
+	 end
+	 srl[0] <= i_d;			// - newest element always enters at index 0
+      end
+   end // always @ (posedge clock)
+   // - next-state logic is purely combinational, so it uses blocking ''=''
+   always @* begin					// - combi always
+        srlo_       =  'bx;		// - defaults: don't-care unless a branch
+        shift_en_o_ = 1'bx;		//     below overrides them, so no latch
+        shift_en_   = 1'bx;		//     is inferred for any of these
+        addr_       =  'bx;
+        state_      = 2'bx;
+      case (state)
+
+	state_empty: begin		    // - (empty, will not produce)
+	      if (i_v) begin		    // - empty & i_v => consume
+		 srlo_       = i_d;	    // - first element bypasses srl[]
+		 shift_en_o_ = 1;
+		 shift_en_   = 1'bx;
+		 addr_       = 0;
+		 state_      = state_one;
+	      end
+	      else	begin		    // - empty & !i_v => idle
+		 srlo_       = 'bx;
+		 shift_en_o_ = 0;
+		 shift_en_   = 1'bx;
+		 addr_       = 0;
+		 state_      = state_empty;
+	      end
+	end
+
+	state_one: begin		    // - (contains one)
+	      if (i_v && o_b) begin	    // - one & i_v & o_b => consume
+		 srlo_       = 'bx;
+		 shift_en_o_ = 0;
+		 shift_en_   = 1;
+		 addr_       = 0;
+		 state_      = state_more;
+	      end
+	      else if (i_v && !o_b) begin   // - one & i_v & !o_b => cons+prod
+		 srlo_       = i_d;
+		 shift_en_o_ = 1;
+		 shift_en_   = 1;
+		 addr_       = 0;
+		 state_      = state_one;
+	      end
+	      else if (!i_v && o_b) begin   // - one & !i_v & o_b => idle
+		 srlo_       = 'bx;
+		 shift_en_o_ = 0;
+		 shift_en_   = 1'bx;
+		 addr_       = 0;
+		 state_      = state_one;
+	      end
+	      else if (!i_v && !o_b) begin  // - one & !i_v & !o_b => produce
+		 srlo_       = 'bx;
+		 shift_en_o_ = 0;
+		 shift_en_   = 1'bx;
+		 addr_       = 0;
+		 state_      = state_empty;
+	      end
+	end // case: state_one
+
+	state_more: begin		    // - (contains more than one)
+	   if (addr_full || (depth==2)) begin
+					    // - (full, will not consume)
+					    // - (full here if depth==2)
+	      if (o_b) begin		    // - full & o_b => idle
+		 srlo_       = 'bx;
+		 shift_en_o_ = 0;
+		 shift_en_   = 0;
+		 addr_       = addr;
+		 state_      = state_more;
+	      end
+	      else begin		    // - full & !o_b => produce
+		 srlo_       = srl[addr];
+		 shift_en_o_ = 1;
+		 shift_en_   = 0;
+		 // - at addr==0 only one element remains after this produce,
+		 //     so hold addr at 0 and drop back to state_one
+		 addr_       = addr_zero_ ? 0         : addr-1;
+		 state_      = addr_zero_ ? state_one : state_more;
+	      end
+	   end
+	   else begin			    // - (mid: neither empty nor full)
+	      if (i_v && o_b) begin	    // - mid & i_v & o_b => consume
+		 srlo_       = 'bx;
+		 shift_en_o_ = 0;
+		 shift_en_   = 1;
+		 addr_       = addr+1;
+		 state_      = state_more;
+	      end
+	      else if (i_v && !o_b) begin   // - mid & i_v & !o_b => cons+prod
+		 srlo_       = srl[addr];
+		 shift_en_o_ = 1;
+		 shift_en_   = 1;
+		 addr_       = addr;
+		 state_      = state_more;
+	      end
+	      else if (!i_v && o_b) begin   // - mid & !i_v & o_b => idle
+		 srlo_       = 'bx;
+		 shift_en_o_ = 0;
+		 shift_en_   = 0;
+		 addr_       = addr;
+		 state_      = state_more;
+	      end
+	      else if (!i_v && !o_b) begin  // - mid & !i_v & !o_b => produce
+		 srlo_       = srl[addr];
+		 shift_en_o_ = 1;
+		 shift_en_   = 0;
+		 addr_       = addr_zero_ ? 0         : addr-1;
+		 state_      = addr_zero_ ? state_one : state_more;
+	      end
+	   end // else: !if(addr_full)
+	end // case: state_more
+
+	default: begin			    // - unused encoding 2'd3: don't care
+		 srlo_       =  'bx;
+		 shift_en_o_ = 1'bx;
+		 shift_en_   = 1'bx;
+		 addr_       =  'bx;
+		 state_      = 2'bx;
+	end // case: default
+
+      endcase // case(state)
+   end // always @ *
+
+endmodule // Q_srl
+
+
+`endif  // `ifdef  Q_srl
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 4327b2477309eea73851c2263459017cf11f9f70..e58d5fe24f882649dc5b4bcd09dbd39470c7f3d4 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -134,10 +134,14 @@ wire [31:0] config_q0;
 
 //multiple wire AXI Streams
 reg m_axis_0_afull = 0;
-reg m_axis_0_tready;
+reg m_axis_0_tready_inv;
 wire m_axis_0_tvalid;
 wire $WEIGHT_RANGE$ m_axis_0_tdata;
 
+reg m_axis_0_tready_q;
+wire m_axis_0_tvalid_q;
+wire $WEIGHT_RANGE$ m_axis_0_tdata_q;
+
 reg m_axis_1_afull = 0;
 reg m_axis_1_tready = 1;
 wire m_axis_1_tvalid;
@@ -213,7 +217,7 @@ mem
 
 //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
 .m_axis_0_afull(m_axis_0_afull),
-.m_axis_0_tready(m_axis_0_tready),
+.m_axis_0_tready(!m_axis_0_tready_inv),
 .m_axis_0_tvalid(m_axis_0_tvalid),
 .m_axis_0_tdata(m_axis_0_tdata),
 
@@ -245,6 +249,23 @@ mem
 
 );
 
+// weight streamer FIFO
+Q_srl #(
+.depth(8),
+.width($WEIGHT_WIDTH$)
+)
+$LAYER_NAME$_w_fifo
+(
+ .clock(ap_clk),
+ .reset(!ap_rst_n),
+ .i_d(m_axis_0_tdata),
+ .i_v(m_axis_0_tvalid),
+ .i_b(m_axis_0_tready_inv),
+ .o_d(m_axis_0_tdata_q),
+ .o_v(m_axis_0_tvalid_q),
+ .o_b(!m_axis_0_tready_q)
+);
+
 //MVA_Stream_Unit
 
 $LAYER_NAME$
@@ -255,9 +276,9 @@ MVA_Stream_U
 .in0_V_V_TDATA(in0_V_V_TDATA),		//$IN_RANGE$ input
 .in0_V_V_TVALID(in0_V_V_TVALID),  	//input
 .in0_V_V_TREADY(in0_V_V_TREADY),	//output
-.weights_V_V_TDATA(m_axis_0_tdata),	//$WEIGHT_RANGE$ input
-.weights_V_V_TVALID(m_axis_0_tvalid),	//input
-.weights_V_V_TREADY(m_axis_0_tready),	//output
+.weights_V_V_TDATA(m_axis_0_tdata_q),	//$WEIGHT_RANGE$ input
+.weights_V_V_TVALID(m_axis_0_tvalid_q),	//input
+.weights_V_V_TREADY(m_axis_0_tready_q),	//output
 .out_V_V_TDATA(out_V_V_TDATA),		//$OUT_RANGE$ output
 .out_V_V_TVALID(out_V_V_TVALID),	//output
 .out_V_V_TREADY(out_V_V_TREADY)		//input