From a976e5bc8a9d360485f8a7e2077dc39a58368c73 Mon Sep 17 00:00:00 2001
From: Lucian Petrica <lucianp@xilinx.com>
Date: Fri, 23 Jul 2021 11:58:47 +0100
Subject: [PATCH] Removed template for streaming thresholds (moved to hlslib);
 added h/w dimensions for imdim (non-square)

---
 src/finn/custom_op/fpgadataflow/templates.py  | 53 -------------------
 .../fpgadataflow/thresholding_batch.py        | 22 ++++----
 2 files changed, 11 insertions(+), 64 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 40221ce3b..7f8dbc787 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -355,56 +355,3 @@ $LAYER_NAME$
 
 endmodule
 """
-
-decoupled_thresholding_template = """
-template <
-    unsigned ImgDim, unsigned NumChannels, unsigned PE,
-    typename TSrcI = Identity, typename TDstI = Identity,
-    int ActVal=0, typename TT, unsigned int NumSteps,
-    typename TI, typename TO>
-void Thresholding_Stream_Batch(hls::stream<TI> &in,
-                        hls::stream<TO> &out,
-                        hls::stream<ap_uint<PE*NumSteps*TT::width>> &weight,
-                        int const reps)
-{
-
-  // how many different rows each neuron will compute
-  // alternatively: number of vertical matrix chunks
-  unsigned const NF = NumChannels / PE;
-
-  ThresholdsActivation<1, PE, NumSteps, TT, TO, ActVal, comp::less_equal<TT>> internal_thr;
-  #pragma HLS ARRAY_PARTITION variable=internal_thr.m_thresholds complete dim=0
-
-  // everything merged into a common iteration space (one "big" loop instead
-  // of smaller nested loops) to get the pipelinening the way we want
-  for (unsigned i = 0; i < reps * ImgDim * ImgDim * NF; i++)
-  {
-    #pragma HLS PIPELINE II=1
-
-    ap_uint<PE*NumSteps*TT::width> packed_thr;
-    packed_thr = weight.read();
-    // slicer to get 1 PE's worth of thresholds
-    auto const pe_slicer = Slice<ap_uint<NumSteps*TT::width>>()(packed_thr);
-
-    TI inElem;
-    inElem = in.read();
-    auto outElem = TDstI().template operator()<TO>();
-
-    for (unsigned pe = 0; pe < PE; pe++)
-    {
-#pragma HLS UNROLL
-      // slicer to get individual thresholds
-      auto const thr_slicer = Slice<TT>()(pe_slicer(pe, 0));
-      for (unsigned nt = 0; nt < NumSteps; nt++)
-      {
-      #pragma HLS UNROLL
-        internal_thr.m_thresholds[pe][0][nt] = thr_slicer(nt, 0);
-      }
-
-      auto const act = TSrcI()(inElem);
-      outElem(pe,0,1) = internal_thr.activate(0, pe, act(pe,0));
-    }
-    out.write(outElem);
-  }
-}
-"""
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 2944aeaa3..b4dc32943 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -621,10 +621,6 @@ class Thresholding_Batch(HLSCustomOp):
             self.code_gen_dict["$DEFINES$"].append(
                 "#define NumSteps1 %d" % self.get_nodeattr("numSteps")
             )
-            # TODO remove once Thresholding_Stream_Batch is in hlslib:
-            self.code_gen_dict["$DEFINES$"].append(
-                templates.decoupled_thresholding_template
-            )
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -679,28 +675,32 @@ class Thresholding_Batch(HLSCustomOp):
         node = self.onnx_node
         ishape = self.get_folded_input_shape()
         if len(ishape) == 3:
-            imgdim = 1
+            imgdimh = 1
+            imgdimw = 1
         elif len(ishape) == 5:
-            imgdim = ishape[1]
+            imgdimh = ishape[1]
+            imgdimw = ishape[2]
         else:
-            raise Exception("""Unexpeted input shape""")
+            raise Exception("""Unexpected input shape""")
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "const":
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<{}, NumChannels1, PE1, {}, {}>
+                """{}<{}, {}, NumChannels1, PE1, {}, {}>
                 (in0, out, threshs, numReps);""".format(
                     node.op_type,
-                    imgdim,
+                    imgdimh,
+                    imgdimw,
                     tmpl_args["TSrcI"],
                     tmpl_args["TDstI"],
                 )
             ]
         elif mem_mode == "decoupled":
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
+                """{}<{}, {}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
                 (in0, out, weights, numReps);""".format(
                     "Thresholding_Stream_Batch",
-                    imgdim,
+                    imgdimh,
+                    imgdimw,
                     tmpl_args["TSrcI"],
                     tmpl_args["TDstI"],
                 )
-- 
GitLab