diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
index 7b207fbd6db7c9d985ba3ed50d7fcd97612e07f5..4ddb7c6f1c62a015e0468cb651f1f4b19944d172 100644
--- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
+++ b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
@@ -37,7 +37,7 @@ module ramb18_wf_dualport
 )
 (
 	input clk,
-	
+
 	input wea,
 	input [AWIDTH-1:0] addra,
 	input [DWIDTH-1:0] wdataa,
@@ -53,7 +53,7 @@ module ramb18_wf_dualport
 reg [DWIDTH-1:0] rdataa;
 reg [DWIDTH-1:0] rdatab;
-reg [7:0] idx = ID;
+reg [15:0] idx;
 
 //initialize memory
 initial begin
 	//note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
@@ -63,10 +63,11 @@ initial begin
 		$finish();
 	end
 	//MEM_INIT path must be terminated by /
-	if (ID < 10)
-		$readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, 1023);
+	$sformat(idx,"%0d",ID);
+	if (ID < 10)
+		$readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, 1023);
 	else
-		$readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, 1023);
+		$readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, 1023);
 end
 
 //memory ports, with output pipeline register
@@ -83,4 +84,4 @@ always @(posedge clk) begin
 	rdqb <= rdatab;
 end
 
-endmodule
\ No newline at end of file
+endmodule
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 7784024aae102989338df9b040fcfc1f9dc36983..e0c3348cf9926e8a7272688cca29ab18cbf3ad16 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -40,7 +40,10 @@ except ModuleNotFoundError:
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.basic import (
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
@@ -513,20 +516,23 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 weight_tensor_unflipped, export_wdt, weight_width, prefix=""
             )
             weight_stream_len = np.prod(weight_tensor_unflipped.shape)
-            assert (
-                weight_stream_len <= 1024
-            ), """Decoupled mem mode needs
-            weight stream length <= 1024 for now"""
+            factor = math.ceil(weight_stream_len / 1024)
             # add zeroes to pad out file to 1024 entries
             weight_stream = weight_tensor_unflipped.flatten()
-            pad_amt = 1024 - weight_stream_len
+            pad_amt = (factor * 1024) - weight_stream_len
             weight_stream = np.pad(
                 weight_stream, (0, pad_amt), mode="constant", constant_values="0"
             )
             weight_stream = weight_stream.copy()
-            with open("{}/memblock_0.dat".format(code_gen_dir), "w+") as f:
-                for val in weight_stream:
+            i = 0
+            j = 0
+            for val in weight_stream:
+                if i == 1024:
+                    i = 0
+                    j += 1
+                with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
                     f.write(val + "\n")
+                i += 1
 
         else:
             raise Exception(
@@ -997,7 +1003,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
         mw = self.get_nodeattr("MW")
         mh = self.get_nodeattr("MH")
-        self.code_gen_dict["$WEIGHT_DEPTH$"] = [str(int(mw * mh))]
+        depth = int(mw * mh)
+        self.code_gen_dict["$WEIGHT_DEPTH$"] = [str(depth)]
+        self.code_gen_dict["$MEM_DEPTH$"] = [
+            str(roundup_to_integer_multiple(depth, 1024))
+        ]
 
         template = self.decoupled_wrapper
 
@@ -1034,9 +1044,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             if file.endswith(".v"):
                 verilog_file = os.path.join(memstream_dir, file)
                 copy(verilog_file, verilog_folder)
-        # copy .dat file of weights
-        dat_file = "{}/memblock_0.dat".format(code_gen_dir)
-        copy(dat_file, verilog_folder)
+        # copy .dat files of weights
+        for file in os.listdir(code_gen_dir):
+            if file.endswith(".dat"):
+                dat_file = os.path.join(code_gen_dir, file)
+                copy(dat_file, verilog_folder)
         # copy verilog wrapper
         verilog_wrapper = "{}/{}_memstream.v".format(
             code_gen_dir, self.onnx_node.name
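Note on the two hunks above: decoupled mem mode previously rejected weight streams longer than 1024 entries (one BRAM init file, memblock_0.dat). The patch instead pads the stream to the next multiple of 1024 and splits it across numbered memblock_<j>.dat files, one per 1024-deep block, which the Verilog then loads per-BRAM via $readmemh. A minimal standalone sketch of that chunking follows; the function name and the list-of-hex-strings input are illustrative, not part of the patch:

    import math

    def split_weight_stream(weight_stream, code_gen_dir, block_size=1024):
        # weight_stream: flat list of hex strings, one memory word each
        # (illustrative helper, not part of the patch)
        factor = math.ceil(len(weight_stream) / block_size)
        pad_amt = factor * block_size - len(weight_stream)
        padded = list(weight_stream) + ["0"] * pad_amt
        # one memblock_<j>.dat per BRAM, each exactly block_size lines,
        # matching the 0..1023 range read by ramb18_wf_dualport.v
        for j in range(factor):
            block = padded[j * block_size:(j + 1) * block_size]
            with open("{}/memblock_{}.dat".format(code_gen_dir, j), "w") as f:
                f.write("\n".join(block) + "\n")

One design difference worth noting: the patch reopens memblock_<j>.dat in "a+" mode once per value, so leftover .dat files would be appended to if the directory were ever reused; the sketch writes each block exactly once in "w" mode, which is one way to avoid that.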
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 5323aac2e344fb8b3c1166e695753e68a435b08f..3e5205d9e8fc1abd5938f8a3dc4df489f81b9eb7 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -183,7 +183,7 @@ memstream
 // memory, set per-stream offsets in memory, set per-stream widths
 .CONFIG_EN(1),
 .NSTREAMS(1),
-.MEM_DEPTH(1024),
+.MEM_DEPTH($MEM_DEPTH$),
 .MEM_WIDTH($WEIGHT_WIDTH$),
 .MEM_INIT("./"),
 
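The wrapper template no longer hard-codes MEM_DEPTH to 1024; the $MEM_DEPTH$ placeholder is filled from code_gen_dict with the weight depth rounded up to a whole number of 1024-deep blocks, so it always agrees with the number of memblock_*.dat files emitted above. roundup_to_integer_multiple comes from finn.util.basic; a minimal re-implementation for illustration only:

    def roundup_to_integer_multiple(x, factor):
        # smallest multiple of factor that is >= x (illustrative
        # re-implementation of the finn.util.basic helper)
        if x % factor == 0:
            return x
        return x + (factor - x % factor)

    # e.g. WEIGHT_DEPTH 1500 -> MEM_DEPTH 2048 (two BRAM blocks),
    # WEIGHT_DEPTH 16384 -> MEM_DEPTH 16384 (already aligned)
    assert roundup_to_integer_multiple(1500, 1024) == 2048
    assert roundup_to_integer_multiple(16384, 1024) == 16384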
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 80c9e84ba92c93e8a5d57ffaceb22b5abf188963..330d9a1aedcd6607ae0c150b0ea9ef439afcc1df 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -300,3 +300,90 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
     assert "StreamingFCLayer_Batch_0" in hls_synt_res_est
+
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["decoupled"])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType.INT4])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType.INT4])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT4])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [-1])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [128])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [128])
+def test_fpgadataflow_fclayer_large_depth_decoupled_mode(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, mw))
+    if act is None:
+        # no activation, produce accumulators
+        T = None
+        tdt = None
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            odt = DataType.UINT32
+        else:
+            odt = DataType.INT32
+    else:
+        odt = act
+        (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+        n_steps = act.get_num_possible_values() - 1
+        T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
+        # provide non-decreasing thresholds
+        T = np.sort(T, axis=1)
+        # generate thresholds for activation
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            tdt = DataType.UINT32
+            # bias thresholds to be positive
+            T = np.ceil((T + mw) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType.INT32
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
+
+    # prepare input data
+    input_dict = prepare_inputs(x, idt, wdt)
+    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+        # convert inputs to binary and use xnorpopcountmatmul
+        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
+    else:
+        y = np.matmul(x, W)
+    if T is not None:
+        y = multithreshold(y, T)
+        if act == DataType.BIPOLAR:
+            # binary to bipolar
+            y = 2 * y - 1
+        else:
+            # signed offset
+            y += act.min()
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+    # TODO split up into several dependent tests -- need to check how this
+    # works for parametrized tests...
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynth_IPGen())
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed"
+
+    hls_synt_res_est = model.analysis(hls_synth_res_estimation)
+    assert "StreamingFCLayer_Batch_0" in hls_synt_res_est
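The new test mirrors the existing decoupled-mode rtlsim test but pins mw = mh = 128 with maximum folding, which pushes the weight stream well past the old 1024-entry limit that the removed assert used to enforce. Illustrative arithmetic (not in the patch) for what it exercises:

    # nf = sf = -1 resolve to mh and mw, so pe = simd = 1
    mw = mh = 128
    weight_stream_len = (mw * mh) // (1 * 1)     # 16384 hex entries
    n_dat_files = -(-weight_stream_len // 1024)  # ceil division
    assert n_dat_files == 16  # 16 memblock_*.dat files, MEM_DEPTH 16384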