diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index bf1ad4f62d00a7658051be71b415e20bacfcafb1..c7c418482dbdfd6f4e6539a9eaf6ec4b9ff1e24b 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -86,10 +86,10 @@ RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg # git-based Python repo dependencies # these are installed in editable mode for easier co-development -ARG FINN_BASE_COMMIT="ac0b86a63eb937b869bfa453a996a8a8b8506546" +ARG FINN_BASE_COMMIT="d38426634f1cfbc5432a0e52c3f65c07fad12aa4" ARG FINN_EXP_COMMIT="f82c0d9868bb88ea045dfadb28508d327d287221" ARG BREVITAS_COMMIT="462f86cdc60f9915baf13afd1676fb21da44c2ee" -ARG PYVERILATOR_COMMIT="e2ff74030de3992dcac54bf1b6aad2915946e8cb" +ARG PYVERILATOR_COMMIT="0c3eb9343500fc1352a02c020a736c8c2db47e8e" ARG CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" ARG HLSLIB_COMMIT="fbb07135b3d991602e8abe3f2c51212c11fd392b" ARG OMX_COMMIT="1dfc4aa2f2895632742cd5751520c6b472feb74e" diff --git a/run-docker.sh b/run-docker.sh index 19a66f9e95c57be8ef332a075ff69c000488fdce..a1147fcee55d345850da4c533dd9e88270d727a6 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -118,14 +118,14 @@ elif [ "$1" = "notebook" ]; then DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT " elif [ "$1" = "build_dataflow" ]; then BUILD_DATAFLOW_DIR=$(readlink -f "$2") - DOCKER_EXTRA="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR" + DOCKER_EXTRA="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR " DOCKER_INTERACTIVE="-it" #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build gecho "Running build_dataflow for folder $BUILD_DATAFLOW_DIR" DOCKER_CMD="build_dataflow $BUILD_DATAFLOW_DIR" elif [ "$1" = "build_custom" ]; then BUILD_CUSTOM_DIR=$(readlink -f "$2") - DOCKER_EXTRA="-v $BUILD_CUSTOM_DIR:$BUILD_CUSTOM_DIR -w $BUILD_CUSTOM_DIR" + DOCKER_EXTRA="-v $BUILD_CUSTOM_DIR:$BUILD_CUSTOM_DIR -w $BUILD_CUSTOM_DIR " DOCKER_INTERACTIVE="-it" #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build gecho "Running build_custom: 
$BUILD_CUSTOM_DIR/build.py" @@ -141,7 +141,7 @@ if [ "$FINN_DOCKER_GPU" != 0 ];then if [ ! -z "$NVIDIA_VISIBLE_DEVICES" ];then DOCKER_EXTRA+="--runtime nvidia -e NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES " else - DOCKER_EXTRA+="--gpus all" + DOCKER_EXTRA+="--gpus all " fi fi diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 4aa1ad31e1ad73762ef46cc861b1a255ce57b926..c4664a5471984e1f88a70f1d9bb6ce674e38c782 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -62,7 +62,7 @@ class StreamToLogger(object): pass -def resolve_build_steps(cfg: DataflowBuildConfig): +def resolve_build_steps(cfg: DataflowBuildConfig, partial: bool = True): steps = cfg.steps if steps is None: steps = default_build_dataflow_steps @@ -76,19 +76,56 @@ def resolve_build_steps(cfg: DataflowBuildConfig): steps_as_fxns.append(transform_step) else: raise Exception("Could not resolve build step: " + str(transform_step)) + if partial: + step_names = list(map(lambda x: x.__name__, steps_as_fxns)) + if cfg.start_step is None: + start_ind = 0 + else: + start_ind = step_names.index(cfg.start_step) + if cfg.stop_step is None: + stop_ind = len(step_names) - 1 + else: + stop_ind = step_names.index(cfg.stop_step) + steps_as_fxns = steps_as_fxns[start_ind : (stop_ind + 1)] + return steps_as_fxns +def resolve_step_filename( + step_name: str, cfg: DataflowBuildConfig, step_delta: int = 0 +): + step_names = list( + map(lambda x: x.__name__, resolve_build_steps(cfg, partial=False)) + ) + assert step_name in step_names, "start_step %s not found" % step_name + step_no = step_names.index(step_name) + step_delta + assert step_no >= 0, "Invalid step+delta combination" + assert step_no < len(step_names), "Invalid step+delta combination" + filename = cfg.output_dir + "/intermediate_models/" + filename += "%s.onnx" % (step_names[step_no]) + return filename + + def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): """Best-effort 
build a dataflow accelerator using the given configuration. :param model_filename: ONNX model filename to build :param cfg: Build configuration """ - model = ModelWrapper(model_filename) + # if start_step is specified, override the input model + if cfg.start_step is None: + print("Building dataflow accelerator from " + model_filename) + model = ModelWrapper(model_filename) + else: + intermediate_model_filename = resolve_step_filename(cfg.start_step, cfg, -1) + print( + "Building dataflow accelerator from intermediate checkpoint " + + intermediate_model_filename + ) + model = ModelWrapper(intermediate_model_filename) assert type(model) is ModelWrapper finn_build_dir = os.environ["FINN_BUILD_DIR"] - print("Building dataflow accelerator from " + model_filename) + print("Intermediate outputs will be generated in " + finn_build_dir) print("Final outputs will be generated in " + cfg.output_dir) print("Build log is at " + cfg.output_dir + "/build_dataflow.log") @@ -132,7 +169,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): sys.stdout = stdout_orig sys.stderr = stderr_orig time_per_step[step_name] = step_end - step_start - chkpt_name = "%d_%s.onnx" % (step_num, step_name) + chkpt_name = "%s.onnx" % (step_name) if cfg.save_intermediate_models: intermediate_model_dir = cfg.output_dir + "/intermediate_models" if not os.path.exists(intermediate_model_dir): diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 4a112699ec9bdf126f447fe2244eb01f6f4fa042..052a6c701a639929f9c2dff682c2a8777b679788 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -172,6 +172,13 @@ class DataflowBuildConfig: #: that will override the target_fps setting here. target_fps: Optional[int] = None + #: (Optional) Use two-pass relaxation for folding, only relevant if target_fps + #: is set. 
If enabled, parallelization will internally run a second time if the + #: target cycles from the first pass could not be achieved, instead using the + #: achievable target to obtain a balanced pipeline. If disabled, this can be + #: useful for decreasing the latency (even though throughput won't increase). + folding_two_pass_relaxation: Optional[bool] = True + #: (Optional) At which steps the generated intermediate output model #: will be verified. See documentation of VerificationStepType for #: available options. @@ -185,6 +192,19 @@ class DataflowBuildConfig: #: verification. Only required if verify_steps is not empty. verify_expected_output_npy: Optional[str] = "expected_output.npy" + #: (Optional) Save full execution context for each of the verify_steps. + #: By default, only the top-level graph output is saved. + verify_save_full_context: Optional[bool] = False + + #: (Optional) Save .vcd waveforms from rtlsim under reports. + #: By default, waveforms won't be saved. + verify_save_rtlsim_waveforms: Optional[bool] = False + + #: (Optional) Run synthesis to generate a .dcp for the stitched-IP output product. + #: This can make it easier to treat it as a standalone artifact without requiring + #: the full list of layer IP build directories. By default, synthesis will not run. + stitched_ip_gen_dcp: Optional[bool] = False + #: (Optional) Control the maximum width of the per-PE MVAU stream while #: exploring the parallelization attributes to reach target_fps #: Only relevant if target_fps is specified. @@ -264,6 +284,13 @@ class DataflowBuildConfig: #: - functions are called with (model, DataflowBuildConfig) as args steps: Optional[List[Any]] = None + #: If given, start from this step, loading the intermediate model generated + #: from the previous step (save_intermediate_models must be enabled) + start_step: Optional[str] = None + + #: If given, stop at this step. 
+ stop_step: Optional[str] = None + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified @@ -333,4 +360,7 @@ class DataflowBuildConfig: + self.verify_expected_output_npy ) verify_expected_output_npy = np.load(self.verify_expected_output_npy) - return (verify_input_npy, verify_expected_output_npy) + return ( + verify_input_npy.astype(np.float32), + verify_expected_output_npy.astype(np.float32), + ) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 5bdccebb58ccb6f4906a05dda58da2494366739f..b9c065ed2514cbbf9f92391ce496705aa3d4a822 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -30,7 +30,8 @@ import json import numpy as np import os from copy import deepcopy -from shutil import copy, copytree +from distutils.dir_util import copy_tree +from shutil import copy import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls import finn.transformation.streamline.absorb as absorb @@ -70,7 +71,6 @@ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) @@ -97,6 +97,7 @@ from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC from finn.util.config import extract_model_config_to_json +from finn.util.pyverilator import pyverilate_get_liveness_threshold_cycles from finn.util.test import execute_parent @@ -115,21 +116,77 @@ def verify_step( parent_model_fn = 
intermediate_models_dir + "/dataflow_parent.onnx" child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name model.save(child_model_fn) - out_npy = execute_parent(parent_model_fn, child_model_fn, in_npy) + out_tensor_name = ModelWrapper(parent_model_fn).graph.output[0].name + out_dict = execute_parent( + parent_model_fn, child_model_fn, in_npy, return_full_ctx=True + ) + out_npy = out_dict[out_tensor_name] else: inp_tensor_name = model.graph.input[0].name out_tensor_name = model.graph.output[0].name inp_dict = {inp_tensor_name: in_npy} - out_dict = execute_onnx(model, inp_dict) + out_dict = execute_onnx(model, inp_dict, True) out_npy = out_dict[out_tensor_name] res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all() res_to_str = {True: "SUCCESS", False: "FAIL"} res_str = res_to_str[res] - verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % (step_name, res_str) - np.save(verification_output_fn, out_npy) + if cfg.verify_save_full_context: + verification_output_fn = verify_out_dir + "/verify_%s_%s.npz" % ( + step_name, + res_str, + ) + np.savez(verification_output_fn, **out_dict) + else: + verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % ( + step_name, + res_str, + ) + np.save(verification_output_fn, out_npy) print("Verification for %s : %s" % (step_name, res_str)) +def prepare_for_stitched_ip_rtlsim(verify_model, cfg): + need_restitch = False + # rtlsim only supports certain impl_style for some nodes + # StreamingFIFO must have impl_style=rtl + for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): + inst = getCustomOp(fifo_layer) + if inst.get_nodeattr("impl_style") != "rtl": + inst.set_nodeattr("impl_style", "rtl") + inst.set_nodeattr("code_gen_dir_ipgen", "") + inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # StreamingDataWidthConverter must have impl_style=hls + for dwc_layer in verify_model.get_nodes_by_op_type( + "StreamingDataWidthConverter_Batch" + ): + inst = getCustomOp(dwc_layer) + if 
inst.get_nodeattr("impl_style") != "hls": + inst.set_nodeattr("impl_style", "hls") + inst.set_nodeattr("code_gen_dir_ipgen", "") + inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # if we've made alterations to the model, need to do some re-prep + if need_restitch: + print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") + verify_model = verify_model.transform( + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) + ) + verify_model = verify_model.transform(HLSSynthIP()) + verify_model = verify_model.transform( + CreateStitchedIP( + cfg._resolve_fpga_part(), + cfg.synth_clk_period_ns, + vitis=False, + ) + ) + # set top-level prop for stitched-ip rtlsim and launch + verify_model.set_metadata_prop("exec_mode", "rtlsim") + # TODO make configurable + # verify_model.set_metadata_prop("rtlsim_trace", "trace.vcd") + return verify_model + + def step_tidy_up(model: ModelWrapper, cfg: DataflowBuildConfig): """Run the tidy-up step on given model. This includes shape and datatype inference, constant folding, and giving nodes and tensors better names. @@ -164,6 +221,7 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(MakeMaxPoolNHWC()) model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) model = model.transform(MakeMaxPoolNHWC()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(Streamline()) # absorb final add-mul nodes into TopK @@ -212,7 +270,12 @@ def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig nodes, which point to a separate ONNX file. 
Dataflow accelerator synthesis can only be performed on those HLSCustomOp sub-graphs.""" - parent_model = model.transform(CreateDataflowPartition()) + parent_model = model.transform( + CreateDataflowPartition( + partition_model_dir=cfg.output_dir + + "/intermediate_models/supported_op_partitions" + ) + ) sdp_nodes = parent_model.get_nodes_by_op_type("StreamingDataflowPartition") assert len(sdp_nodes) == 1, "Only a single StreamingDataflowPartition supported." sdp_node = sdp_nodes[0] @@ -231,7 +294,11 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi target_cycles_per_frame = cfg._resolve_cycles_per_frame() if target_cycles_per_frame is not None: model = model.transform( - SetFolding(target_cycles_per_frame, mvau_wwidth_max=cfg.mvau_wwidth_max) + SetFolding( + target_cycles_per_frame, + mvau_wwidth_max=cfg.mvau_wwidth_max, + two_pass_relaxation=cfg.folding_two_pass_relaxation, + ) ) return model @@ -380,25 +447,35 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): if DataflowOutputType.STITCHED_IP in cfg.generate_outputs: stitched_ip_dir = cfg.output_dir + "/stitched_ip" model = model.transform( - CreateStitchedIP(cfg._resolve_fpga_part(), cfg.synth_clk_period_ns) + CreateStitchedIP( + cfg._resolve_fpga_part(), + cfg.synth_clk_period_ns, + vitis=cfg.stitched_ip_gen_dcp, + ) ) # TODO copy all ip sources into output dir? as zip? 
- copytree(model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir) + copy_tree(model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir) print("Vivado stitched IP written into " + stitched_ip_dir) if VerificationStepType.STITCHED_IP_RTLSIM in cfg._resolve_verification_steps(): # prepare ip-stitched rtlsim verify_model = deepcopy(model) - # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that - for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): - getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl") - # similarly for StreamingDataWidthConverter with impl_style=hls - for dwc_layer in verify_model.get_nodes_by_op_type( - "StreamingDataWidthConverter_Batch" - ): - getCustomOp(dwc_layer).set_nodeattr("impl_style", "hls") - verify_model = verify_model.transform(PrepareRTLSim()) - verify_model.set_metadata_prop("exec_mode", "rtlsim") + verify_model = prepare_for_stitched_ip_rtlsim(verify_model, cfg) + # use critical path estimate to set rtlsim liveness threshold + # (very conservative) + verify_model = verify_model.transform(AnnotateCycles()) + estimate_network_performance = verify_model.analysis(dataflow_performance) + prev_liveness = pyverilate_get_liveness_threshold_cycles() + os.environ["LIVENESS_THRESHOLD"] = str( + int(estimate_network_performance["critical_path_cycles"]) + ) + if cfg.verify_save_rtlsim_waveforms: + report_dir = cfg.output_dir + "/report" + os.makedirs(report_dir, exist_ok=True) + verify_model.set_metadata_prop( + "rtlsim_trace", "%s/verify_rtlsim.vcd" % (report_dir) + ) verify_step(verify_model, cfg, "stitched_ip_rtlsim", need_parent=True) + os.environ["LIVENESS_THRESHOLD"] = str(prev_liveness) return model @@ -411,28 +488,20 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi assert ( DataflowOutputType.STITCHED_IP in cfg.generate_outputs ), "rtlsim_perf needs stitched IP" + report_dir = cfg.output_dir + "/report" + os.makedirs(report_dir, exist_ok=True) # prepare 
ip-stitched rtlsim rtlsim_model = deepcopy(model) - # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that - for fifo_layer in rtlsim_model.get_nodes_by_op_type("StreamingFIFO"): - getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl") - # similarly for StreamingDataWidthConverter with impl_style=hls - for dwc_layer in rtlsim_model.get_nodes_by_op_type( - "StreamingDataWidthConverter_Batch" - ): - getCustomOp(dwc_layer).set_nodeattr("impl_style", "hls") - rtlsim_model = rtlsim_model.transform(PrepareRTLSim()) - rtlsim_model.set_metadata_prop("exec_mode", "rtlsim") + rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg) # run with single input to get latency + if cfg.verify_save_rtlsim_waveforms: + rtlsim_model.set_metadata_prop( + "rtlsim_trace", "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, 1) + ) + rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"])) rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, 1) - rtlsim_latency = rtlsim_perf_dict["cycles"] - # run with num inputs equal to layers to fill the whole pipeline - # to get the steady-state throughput - rtlsim_bs = len(rtlsim_model.graph.node) - rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs) - rtlsim_perf_dict["latency_cycles"] = rtlsim_latency - report_dir = cfg.output_dir + "/report" - os.makedirs(report_dir, exist_ok=True) + rtlsim_latency_bs1 = rtlsim_perf_dict["cycles"] + rtlsim_perf_dict["latency_cycles"] = rtlsim_latency_bs1 with open(report_dir + "/rtlsim_performance.json", "w") as f: json.dump(rtlsim_perf_dict, f, indent=2) @@ -446,7 +515,7 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig): if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs: driver_dir = cfg.output_dir + "/driver" model = model.transform(MakePYNQDriver(cfg._resolve_driver_platform())) - copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir) + copy_tree(model.get_metadata_prop("pynq_driver_dir"), driver_dir) 
print("PYNQ Python driver written into " + driver_dir) return model @@ -487,9 +556,15 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig): os.makedirs(bitfile_dir, exist_ok=True) report_dir = cfg.output_dir + "/report" os.makedirs(report_dir, exist_ok=True) + partition_model_dir = cfg.output_dir + "/intermediate_models/kernel_partitions" if cfg.shell_flow_type == ShellFlowType.VIVADO_ZYNQ: model = model.transform( - ZynqBuild(cfg.board, cfg.synth_clk_period_ns, cfg.enable_hw_debug) + ZynqBuild( + cfg.board, + cfg.synth_clk_period_ns, + cfg.enable_hw_debug, + partition_model_dir=partition_model_dir, + ) ) copy(model.get_metadata_prop("bitfile"), bitfile_dir + "/finn-accel.bit") copy(model.get_metadata_prop("hw_handoff"), bitfile_dir + "/finn-accel.hwh") @@ -513,6 +588,7 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig): strategy=cfg._resolve_vitis_opt_strategy(), enable_debug=cfg.enable_hw_debug, floorplan_file=cfg.vitis_floorplan_file, + partition_model_dir=partition_model_dir, ) ) copy(model.get_metadata_prop("bitfile"), bitfile_dir + "/finn-accel.xclbin") @@ -535,8 +611,8 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): bitfile_dir = cfg.output_dir + "/bitfile" driver_dir = cfg.output_dir + "/driver" os.makedirs(deploy_dir, exist_ok=True) - copytree(bitfile_dir, deploy_dir + "/bitfile") - copytree(driver_dir, deploy_dir + "/driver") + copy_tree(bitfile_dir, deploy_dir + "/bitfile") + copy_tree(driver_dir, deploy_dir + "/driver") return model diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 320b947d0dd99b564da5775dfc8624993af57de2..a552fd419fb3e88c61bfd9229c24e0b71d470b87 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -52,6 +52,7 @@ from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool_batch import 
StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker +from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch from finn.custom_op.fpgadataflow.vector_vector_activate_batch import ( Vector_Vector_Activate_Batch, ) @@ -79,3 +80,4 @@ custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch custom_op["IODMA"] = IODMA custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition +custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py index b1dc02131e45b0a04acb25723e09847ee858ebdc..073d6620ac3c2a4f62ac544e74ecf21b6e36d58f 100644 --- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py +++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py @@ -526,15 +526,18 @@ class ChannelwiseOp_Batch(HLSCustomOp): # should ImgDim be defined or just filled in here like we do now? 
ishape = self.get_folded_input_shape() if len(ishape) == 3: - imgdim = 1 + imgdim_h = 1 + imgdim_w = 1 elif len(ishape) == 5: - imgdim = ishape[1] + imgdim_h = ishape[1] + imgdim_w = ishape[2] else: raise Exception("""Unexpeted input shape""") self.code_gen_dict["$DOCOMPUTE$"] = [ - """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}> + """Thresholding_Batch<{}, {}, NumChannels1, PE1, {}, {}> (in0, out, threshs, numReps);""".format( - imgdim, + imgdim_h, + imgdim_w, tmpl_args["TSrcI"], tmpl_args["TDstI"], ) diff --git a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py index 53446ff1f2aba30e69bf188c1673c738440567fb..cf065cf156abed591e579b3f257e8f442eb3a976 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py +++ b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py @@ -47,6 +47,7 @@ class StreamingDataflowPartition(CustomOp): "partition_id": ("i", False, 0), "device_id": ("i", False, 0), "mem_port": ("s", False, ""), + "instance_name": ("s", False, ""), } def make_shape_compatible_op(self, model): diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 5cf5da1b6d378d3177d0a40017733bd6672c9574..96594d441345332bbe5873570156e07cacbb385d 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -105,6 +105,16 @@ class StreamingFCLayer_Batch(HLSCustomOp): "auto", {"auto", "block", "distributed", "ultra"}, ), + # FPGA resource type for threshold memories (if noActivation is False) + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + "ram_style_thresholds": ( + "s", + False, + "auto", + {"auto", "block", "distributed"}, + ), # (mem_mode = decoupled only) whether weights will be writable through # an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. 
@@ -663,7 +673,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() ret = orig_thres_matrix # workaround for vivado_hls threshold bug - if ret[0][0] == 0: + if ret[0][0] == 0 and n_thres_steps == 1: ret = np.copy(ret) ret[0][0] = 1 warnings.warn( @@ -1212,6 +1222,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): def pragmas(self): mem_mode = self.get_nodeattr("mem_mode") + ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") in_fifo_depth = self.get_nodeattr("inFIFODepth") @@ -1270,6 +1281,28 @@ class StreamingFCLayer_Batch(HLSCustomOp): "complete dim=3" ) ) + # add resource pragma for thresholds if set + if ram_style_thresholds == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS RESOURCE variable=threshs.m_thresholds " + "core=ROM_2P_LUTRAM" + ) + ) + elif ram_style_thresholds == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS RESOURCE variable=threshs.m_thresholds " + "core=ROM_2P_BRAM" + ) + ) + elif ram_style_thresholds == "auto": + # no pragma needed + pass + else: + raise Exception( + "Unrecognized ram_style_thresholds value:" + ram_style_thresholds + ) def code_generation_ipi(self): cmd = [] diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index f6a200a47e9c50999a144229c362ddd70c4b22ec..7fb7634dc22e1d00569e7bb755bf120d6de4f808 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -336,7 +336,7 @@ class Thresholding_Batch(HLSCustomOp): ).all(), "Need int threshold tensor" ret = orig_thres_matrix # workaround for vivado_hls threshold bug - if ret[0][0] == 0: + if ret[0][0] == 0 and n_thres_steps == 1: ret = np.copy(ret) ret[0][0] = 1 
warnings.warn( diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py new file mode 100644 index 0000000000000000000000000000000000000000..e8aa09b1c0754b68e37d01551afe90811f22e7cd --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -0,0 +1,296 @@ +import numpy as np +import os +import warnings +from onnx import TensorProto, helper + +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class UpsampleNearestNeighbour_Batch(HLSCustomOp): + """ + Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function. + Upsampling is done with the Nearest Neighbour algorithm. + The layer expects square feature maps for the in and output. + """ + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + # Size of the output feature map + "OFMDim": ("i", True, 0), + # Size of the input feature map + "IFMDim": ("i", True, 0), + # Amount of channels of the input feature map + "NumChannels": ("i", True, 0), + # FINN input datatype + "inputDataType": ("s", True, ""), + # Batch size + "numInputVectors": ("i", False, 1), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_exp_cycles(self): + OFMDim = self.get_nodeattr("OFMDim") + batch_size = self.get_nodeattr("numInputVectors") + exp_cycles = OFMDim * OFMDim * batch_size + return int(exp_cycles) + + def get_normal_input_shape(self): + IFMDim = self.get_nodeattr("IFMDim") + num_ch = self.get_nodeattr("NumChannels") + batch = self.get_nodeattr("numInputVectors") + ishape = (batch, IFMDim, IFMDim, num_ch) + return ishape + + def get_normal_output_shape(self): + OFMDim = self.get_nodeattr("OFMDim") + num_ch = self.get_nodeattr("NumChannels") + batch = self.get_nodeattr("numInputVectors") + oshape = (batch, OFMDim, OFMDim, num_ch) + return oshape + + 
def get_folded_input_shape(self): + normal_ishape = list(self.get_normal_input_shape()) + return tuple(normal_ishape) + + def get_folded_output_shape(self): + normal_oshape = list(self.get_normal_output_shape()) + return tuple(normal_oshape) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ( + ishape == exp_ishape + ), "Unexpect input shape for UpsampleNearestNeighbour_Batch." + # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + return ret + + def get_output_datatype(self): + """Returns FINN DataType of output. 
(Same as input datatype)""" + return self.get_input_datatype() + + def get_instream_width(self): + ibits = self.get_input_datatype().bitwidth() + ifm_ch = self.get_nodeattr("NumChannels") + return ibits * ifm_ch + + def get_outstream_width(self): + obits = self.get_output_datatype().bitwidth() + ifm_ch = self.get_nodeattr("NumChannels") + return obits * ifm_ch + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("NumChannels") + self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] + + ibits = self.get_input_datatype().bitwidth() + self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] + + idim = self.get_nodeattr("IFMDim") + self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] + + odim = self.get_nodeattr("OFMDim") + self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] + + batch_size = self.get_nodeattr("numInputVectors") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + 
self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_Batch<OFMDim, IFMDim, IFMChannels, + ap_uint<Input_precision> > (in0, out, numReps);""" + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" + % (self.onnx_node.name, packed_hls_type, packed_hls_type) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() 
+ exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = 
output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim, OutputDim, NumChannels).""" diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index 4dd5a080e10e4a0ab5bd14381186e19144f6edb3..df1ab15c7892d66a668d82040b0da93366942cb7 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -85,24 +85,27 @@ class FINNExampleOverlay(Overlay): self.platform = platform self.batch_size = batch_size self.fclk_mhz = fclk_mhz - if self.platform == "alveo": - if "input_dma_name" in io_shape_dict.keys(): - self.idma = getattr(self, io_shape_dict["input_dma_name"]) - else: - self.idma = self.idma0 - self.odma = self.odma0 - self.odma_handle = None - elif self.platform == "zynq-iodma": - if "input_dma_name" in io_shape_dict.keys(): - self.idma = getattr(self, io_shape_dict["input_dma_name"]) - else: - self.idma = self.idma0 - self.odma = self.odma0 + self.idma = [] + self.odma = [] + self.odma_handle = [] + if "input_dma_name" in io_shape_dict.keys(): + for idma_name in io_shape_dict["input_dma_name"]: + self.idma.append(getattr(self, idma_name)) + else: + self.idma = [self.idma0] + if "output_dma_name" in io_shape_dict.keys(): + for odma_name in io_shape_dict["output_dma_name"]: + self.odma.append(getattr(self, odma_name)) + if self.platform == "alveo": + self.odma_handle.append(None) + else: + self.odma = [self.odma0] + if self.platform == "alveo": + self.odma_handle.append(None) + if self.platform == "zynq-iodma": # set the clock frequency as specified by user during transformations if self.fclk_mhz > 0: Clocks.fclk0_mhz = self.fclk_mhz - else: - raise ValueError("Supported platforms are zynq-iodma 
alveo") # load any external + runtime weights self.load_external_weights() self.load_runtime_weights() @@ -204,50 +207,50 @@ class FINNExampleOverlay(Overlay): # run accelerator to flush any stale weights from weight streamer FIFOs self.execute_on_buffers() - @property - def idt(self): - return self._io_shape_dict["idt"] + def idt(self, ind=0): + return self._io_shape_dict["idt"][ind] - @property - def odt(self): - return self._io_shape_dict["odt"] + def odt(self, ind=0): + return self._io_shape_dict["odt"][ind] - @property - def ishape_normal(self): - ret = list(self._io_shape_dict["ishape_normal"]) + def ishape_normal(self, ind=0): + ret = list(self._io_shape_dict["ishape_normal"][ind]) ret[0] = self.batch_size return tuple(ret) - @property - def oshape_normal(self): - ret = list(self._io_shape_dict["oshape_normal"]) + def oshape_normal(self, ind=0): + ret = list(self._io_shape_dict["oshape_normal"][ind]) ret[0] = self.batch_size return tuple(ret) - @property - def ishape_folded(self): - ret = list(self._io_shape_dict["ishape_folded"]) + def ishape_folded(self, ind=0): + ret = list(self._io_shape_dict["ishape_folded"][ind]) ret[0] = self.batch_size return tuple(ret) - @property - def oshape_folded(self): - ret = list(self._io_shape_dict["oshape_folded"]) + def oshape_folded(self, ind=0): + ret = list(self._io_shape_dict["oshape_folded"][ind]) ret[0] = self.batch_size return tuple(ret) - @property - def ishape_packed(self): - ret = list(self._io_shape_dict["ishape_packed"]) + def ishape_packed(self, ind=0): + ret = list(self._io_shape_dict["ishape_packed"][ind]) ret[0] = self.batch_size return tuple(ret) - @property - def oshape_packed(self): - ret = list(self._io_shape_dict["oshape_packed"]) + def oshape_packed(self, ind=0): + ret = list(self._io_shape_dict["oshape_packed"][ind]) ret[0] = self.batch_size return tuple(ret) + @property + def num_inputs(self): + return self._io_shape_dict["num_inputs"] + + @property + def num_outputs(self): + return 
self._io_shape_dict["num_outputs"] + @property def batch_size(self): return self._batch_size @@ -261,68 +264,72 @@ class FINNExampleOverlay(Overlay): self.ibuf_packed_device = None if self.obuf_packed_device is not None: self.obuf_packed_device = None - if self.platform == "alveo": - self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8) - self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8) - else: - self.ibuf_packed_device = allocate( - shape=self.ishape_packed, dtype=np.uint8, cacheable=True + cacheable = {"alveo": False, "zynq-iodma": True}[self.platform] + self.ibuf_packed_device = [] + self.obuf_packed_device = [] + self.obuf_packed = [] + for i in range(self.num_inputs): + new_packed_ibuf = allocate( + shape=self.ishape_packed(i), dtype=np.uint8, cacheable=cacheable ) - self.obuf_packed_device = allocate( - shape=self.oshape_packed, dtype=np.uint8, cacheable=True + self.ibuf_packed_device.append(new_packed_ibuf) + for o in range(self.num_outputs): + new_packed_obuf = allocate( + shape=self.oshape_packed(o), dtype=np.uint8, cacheable=cacheable ) - self.obuf_packed = np.empty_like(self.obuf_packed_device) + self.obuf_packed_device.append(new_packed_obuf) + self.obuf_packed.append(np.empty_like(new_packed_obuf)) - def fold_input(self, ibuf_normal): + def fold_input(self, ibuf_normal, ind=0): """Reshapes input in desired shape. Gets input data (ibuf_normal), checks if data is in expected normal shape. Returns folded input.""" # ensure that shape is as expected - assert ibuf_normal.shape == self.ishape_normal + assert ibuf_normal.shape == self.ishape_normal(ind) # convert to folded form - ibuf_folded = ibuf_normal.reshape(self.ishape_folded) + ibuf_folded = ibuf_normal.reshape(self.ishape_folded(ind)) return ibuf_folded - def pack_input(self, ibuf_folded): + def pack_input(self, ibuf_folded, ind=0): """Packs folded input and reverses both SIMD dim and endianness. 
Gets input data in folded shape and returns packed input data.""" ibuf_packed = finnpy_to_packed_bytearray( ibuf_folded, - self.idt, + self.idt(ind), reverse_endian=True, reverse_inner=True, fast_mode=True, ) return ibuf_packed - def unpack_output(self, obuf_packed): + def unpack_output(self, obuf_packed, ind=0): """Unpacks the packed output buffer from accelerator. Gets packed output and returns output data in folded shape.""" obuf_folded = packed_bytearray_to_finnpy( obuf_packed, - self.odt, - self.oshape_folded, + self.odt(ind), + self.oshape_folded(ind), reverse_endian=True, reverse_inner=True, fast_mode=True, ) return obuf_folded - def unfold_output(self, obuf_folded): + def unfold_output(self, obuf_folded, ind=0): """Unfolds output data to normal shape. Gets folded output data and returns output data in normal shape.""" - obuf_normal = obuf_folded.reshape(self.oshape_normal) + obuf_normal = obuf_folded.reshape(self.oshape_normal(ind)) return obuf_normal - def copy_input_data_to_device(self, data): + def copy_input_data_to_device(self, data, ind=0): """Copies given input data to PYNQ buffer.""" - np.copyto(self.ibuf_packed_device, data) - self.ibuf_packed_device.flush() + np.copyto(self.ibuf_packed_device[ind], data) + self.ibuf_packed_device[ind].flush() - def copy_output_data_from_device(self, data): + def copy_output_data_from_device(self, data, ind=0): """Copies PYNQ output buffer from device.""" - self.obuf_packed_device.invalidate() - np.copyto(data, self.obuf_packed_device) + self.obuf_packed_device[ind].invalidate() + np.copyto(data, self.obuf_packed_device[ind]) def execute_on_buffers(self, asynch=False, batch_size=None): """Executes accelerator by setting up the DMA(s) on pre-allocated buffers. @@ -338,24 +345,36 @@ class FINNExampleOverlay(Overlay): batch_size = self.batch_size assert batch_size <= self.batch_size, "Specified batch_size is too large." 
if self.platform == "zynq-iodma": - assert self.odma.read(0x00) & 0x4 != 0, "Output DMA is not idle" + for o in range(self.num_outputs): + assert ( + self.odma[o].read(0x00) & 0x4 != 0 + ), "Output DMA %d is not idle" % (o) # manually launch IODMAs since signatures are missing for iwdma, iwbuf, iwdma_name in self.external_weights: iwdma.write(0x10, iwbuf.device_address) iwdma.write(0x1C, batch_size) iwdma.write(0x00, 1) - self.idma.write(0x10, self.ibuf_packed_device.device_address) - self.idma.write(0x1C, batch_size) - self.odma.write(0x10, self.obuf_packed_device.device_address) - self.odma.write(0x1C, batch_size) - self.idma.write(0x00, 1) - self.odma.write(0x00, 1) + for o in range(self.num_outputs): + self.odma[o].write(0x10, self.obuf_packed_device[o].device_address) + self.odma[o].write(0x1C, batch_size) + self.odma[o].write(0x00, 1) + for i in range(self.num_inputs): + self.idma[i].write(0x10, self.ibuf_packed_device[i].device_address) + self.idma[i].write(0x1C, batch_size) + self.idma[i].write(0x00, 1) elif self.platform == "alveo": - assert self.odma_handle is None, "Output DMA is already running" - self.idma.start(self.ibuf_packed_device, batch_size) + for o in range(self.num_outputs): + assert self.odma_handle[o] is None, ( + "Output DMA %d is already running" % o + ) + for i in range(self.num_inputs): + self.idma[i].start(self.ibuf_packed_device[i], batch_size) for iwdma, iwbuf, iwdma_name in self.external_weights: iwdma.start(iwbuf, batch_size) - self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size) + for o in range(self.num_outputs): + self.odma_handle[o] = self.odma[o].start( + self.obuf_packed_device[o], batch_size + ) else: raise Exception("Unrecognized platform: %s" % self.platform) # blocking behavior depends on asynch parameter @@ -363,31 +382,48 @@ class FINNExampleOverlay(Overlay): self.wait_until_finished() def wait_until_finished(self): - "Block until the output DMA has finished writing." 
+ "Block until all output DMAs have finished writing." if self.platform == "zynq-iodma": # check if output IODMA is finished via register reads - status = self.odma.read(0x00) - while status & 0x2 == 0: - status = self.odma.read(0x00) + for o in range(self.num_outputs): + status = self.odma[o].read(0x00) + while status & 0x2 == 0: + status = self.odma[o].read(0x00) elif self.platform == "alveo": - assert self.odma_handle is not None, "No odma_handle to wait on" - self.odma_handle.wait() - self.odma_handle = None + assert all( + [x is not None for x in self.odma_handle] + ), "No odma_handle to wait on" + for o in range(self.num_outputs): + self.odma_handle[o].wait() + self.odma_handle[o] = None else: raise Exception("Unrecognized platform: %s" % self.platform) def execute(self, input_npy): - """Given input numpy array, first perform necessary packing and copying - to device buffers, execute on accelerator, then unpack output and return - output numpy array from accelerator.""" - ibuf_folded = self.fold_input(input_npy) - ibuf_packed = self.pack_input(ibuf_folded) - self.copy_input_data_to_device(ibuf_packed) + """Given a single or a list of input numpy array, first perform necessary + packing and copying to device buffers, execute on accelerator, then unpack + output and return output numpy array from accelerator.""" + # if single input, convert to list to normalize how we process the input + if not type(input_npy) is list: + input_npy = [input_npy] + assert self.num_inputs == len( + input_npy + ), "Not all accelerator inputs are specified." 
+ for i in range(self.num_inputs): + ibuf_folded = self.fold_input(input_npy[i], ind=i) + ibuf_packed = self.pack_input(ibuf_folded, ind=i) + self.copy_input_data_to_device(ibuf_packed, ind=i) self.execute_on_buffers() - self.copy_output_data_from_device(self.obuf_packed) - obuf_folded = self.unpack_output(self.obuf_packed) - obuf_normal = self.unfold_output(obuf_folded) - return obuf_normal + outputs = [] + for o in range(self.num_outputs): + self.copy_output_data_from_device(self.obuf_packed[o], ind=o) + obuf_folded = self.unpack_output(self.obuf_packed[o], ind=o) + obuf_normal = self.unfold_output(obuf_folded, ind=o) + outputs.append(obuf_normal) + if self.num_outputs == 1: + return outputs[0] + else: + return outputs def throughput_test(self): """Run accelerator with empty inputs to measure throughput and other metrics. @@ -400,12 +436,14 @@ class FINNExampleOverlay(Overlay): runtime = end - start res["runtime[ms]"] = runtime * 1000 res["throughput[images/s]"] = self.batch_size / runtime - res["DRAM_in_bandwidth[Mb/s]"] = ( - np.prod(self.ishape_packed) * 0.000001 / runtime - ) - res["DRAM_out_bandwidth[Mb/s]"] = ( - np.prod(self.oshape_packed) * 0.000001 / runtime - ) + total_in = 0 + for i in range(self.num_inputs): + total_in += np.prod(self.ishape_packed(i)) + res["DRAM_in_bandwidth[Mb/s]"] = total_in * 0.000001 / runtime + total_out = 0 + for o in range(self.num_outputs): + total_out += np.prod(self.oshape_packed(o)) + res["DRAM_out_bandwidth[Mb/s]"] = total_out * 0.000001 / runtime for iwdma, iwbuf, iwdma_name in self.external_weights: res["DRAM_extw_%s_bandwidth[Mb/s]" % iwdma_name] = ( self.batch_size * np.prod(iwbuf.shape) * 0.000001 / runtime @@ -416,11 +454,11 @@ class FINNExampleOverlay(Overlay): res["fclk[mhz]"] = self.clock_dict["clock0"]["frequency"] res["batch_size"] = self.batch_size # also benchmark driver-related overheads - input_npy = gen_finn_dt_tensor(self.idt, self.ishape_normal) + input_npy = gen_finn_dt_tensor(self.idt(), 
self.ishape_normal()) # provide as int8/uint8 to support fast packing path where possible - if self.idt == DataType.UINT8: + if self.idt() == DataType.UINT8: input_npy = input_npy.astype(np.uint8) - elif self.idt == DataType.INT8: + elif self.idt() == DataType.INT8: input_npy = input_npy.astype(np.int8) start = time.time() ibuf_folded = self.fold_input(input_npy) @@ -441,13 +479,13 @@ class FINNExampleOverlay(Overlay): res["copy_input_data_to_device[ms]"] = runtime * 1000 start = time.time() - self.copy_output_data_from_device(self.obuf_packed) + self.copy_output_data_from_device(self.obuf_packed[0]) end = time.time() runtime = end - start res["copy_output_data_from_device[ms]"] = runtime * 1000 start = time.time() - obuf_folded = self.unpack_output(self.obuf_packed) + obuf_folded = self.unpack_output(self.obuf_packed[0]) end = time.time() runtime = end - start res["unpack_output[ms]"] = runtime * 1000 diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index 001744cba2b59f6d1a0a67fca3e2ad9668a519c0..1b29d4342c830ae896e580f602e810ee25ed234d 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -94,11 +94,11 @@ if __name__ == "__main__": test_labels = test_labels.reshape(n_batches, bsize) for i in range(n_batches): - ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape) + ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device[0].shape) exp = test_labels[i] driver.copy_input_data_to_device(ibuf_normal) driver.execute_on_buffers() - obuf_normal = np.empty_like(driver.obuf_packed_device) + obuf_normal = np.empty_like(driver.obuf_packed_device[0]) driver.copy_output_data_from_device(obuf_normal) ret = np.bincount(obuf_normal.flatten() == exp.flatten()) nok += ret[0] diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 
c749d645dfbf9996c3eea430a0099cb5f12ee60a..3cb193055f3a455d95f7735ab38b2601809dbabd 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -61,7 +61,9 @@ class InferConvInpGen(Transformation): i2c_out_shape = model.get_tensor_shape(i2c_output) dt = model.get_tensor_datatype(i2c_input) if not dt.is_integer(): - warnings.warn("Input is not int. Can't infer ConvInpGen") + warnings.warn( + "%s : Input is not int. Can't infer ConvInpGen." % n.name + ) continue i2c_inst = getCustomOp(n) stride_h, stride_w = i2c_inst.get_nodeattr("stride") @@ -89,9 +91,10 @@ class InferConvInpGen(Transformation): # if padding enabled, ensure pad_val supported by DataType # assert dt.allowed(pad_val),"""FMPadding_Batch DataType # must support pad_val""" - assert ( - pad_val == 0 - ), "FMPadding_Batch doesn't currently support pad_val!= 0" + assert pad_val == 0, ( + "%s : FMPadding_Batch doesn't currently support pad_val!= 0" + % n.name + ) odim_padding_h = ifm_dim_h + pad_h odim_padding_w = ifm_dim_w + pad_w @@ -121,6 +124,7 @@ class InferConvInpGen(Transformation): NumChannels=ifm_ch, inputDataType=dt.name, SIMD=ifm_ch, + name="FMPadding_Batch_" + n.name, ) graph.node.insert(node_ind, padding_node) @@ -134,11 +138,15 @@ class InferConvInpGen(Transformation): ) if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: - assert ( - is_square_image - ), "DownSampler currently only supports square input images." - assert is_equal_stride, """DownSampler currently only supports equal stride value + assert is_square_image, ( + "%s : DownSampler currently only supports square input images." 
+ % n.name + ) + assert is_equal_stride, ( + """%s : DownSampler currently only supports equal stride value along different axes.""" + % n.name + ) ConvInpGen_idim = ConvInpGen_idim_h stride = stride_h # create DownSampler node @@ -153,6 +161,7 @@ class InferConvInpGen(Transformation): SIMD=ifm_ch, Stride=stride, inputDataType=dt.name, + name="DownSampler_" + n.name, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) else: @@ -160,12 +169,16 @@ class InferConvInpGen(Transformation): if ( is_square_image and is_square_kernel ): # square images and square kernels - assert is_equal_stride, """Non-equal strides along different axes is not supported + assert is_equal_stride, ( + """%s: Non-equal strides along different axes is not supported for (non-)square convolutions""" - assert ( - dilation_h == 1 and dilation_w == 1 - ), """Dilation value != 1 is not supported + % n.name + ) + assert dilation_h == 1 and dilation_w == 1, ( + """%s: Dilation value != 1 is not supported for square convolutions""" + % n.name + ) ConvInpGen_node = helper.make_node( "ConvolutionInputGenerator", [ConvInpGen_input], @@ -182,16 +195,19 @@ class InferConvInpGen(Transformation): inputDataType=dt.name, outputDataType=dt.name, depthwise=depthwise, + name="ConvolutionInputGenerator_" + n.name, ) else: # non-square images and/or kernels - assert ( - is_1d_convolution - ), "ConvultionInputGenerator1D works only for 1D convolutions" + assert is_1d_convolution, ( + "%s: ConvolutionInputGenerator1D works only for 1D convs" + % n.name + ) if dilation_h > 1 or dilation_w > 1: - assert ( - stride_h == 1 and stride_w == 1 - ), """Stride value of greater than 1 is not supported for convolutions + assert stride_h == 1 and stride_w == 1, ( + """%s: Stride value of greater than 1 is not supported for convolutions with dilation value greater than 1""" + % n.name + ) ConvInpGen_node = helper.make_node( "ConvolutionInputGenerator1D", [ConvInpGen_input], @@ -208,6 +224,7 @@ class 
InferConvInpGen(Transformation): inputDataType=dt.name, outputDataType=dt.name, depthwise=depthwise, + name="ConvolutionInputGenerator1D_" + n.name, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes @@ -219,6 +236,102 @@ class InferConvInpGen(Transformation): return (model, graph_modified) +class InferUpsample(Transformation): + """ + Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour_Batch nodes. + """ + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "Upsample" or n.op_type == "Resize": + # Extract mode and scales and input shape + mode = get_by_name(n.attribute, "mode").s.decode("ascii") + if n.op_type == "Upsample": + scales = model.get_initializer(n.input[1]) + else: + scales = model.get_initializer(n.input[2]) + in_shape = model.get_tensor_shape(n.input[0]) + + dt = model.get_tensor_datatype(n.input[0]) + if not dt.is_integer(): + warnings.warn( + "%s: Input not int. Can't infer UpsampleNearestNeighbour." + % n.name + ) + continue + + if model.get_tensor_layout(n.input[0]) != DataLayout.NHWC: + warnings.warn( + "%s: Input not NHWC. Can't infer UpsampleNearestNeighbour." + % n.name + ) + continue + + # Check that the parameters are okay + assert mode == "nearest", ( + "%s: Upsampling is only supported for the mode nearest." % n.name + ) + assert len(in_shape) == 4, "Upsampling is only supported for 4D inputs." + assert scales.shape == (4,), ( + "%s: Upsampling is only supported for 4D scales." % n.name + ) + assert (scales >= 1).all(), ( + n.name + ": Upsampling is only supported for scales " + "which are larger or equal 1 in all dimensions." + ) + + # Assumes nhwc layout for scales and input + assert scales[1] == scales[2], ( + "%s: Upsampling is only supported for quadratic scales." 
% n.name + ) + assert scales[0] == scales[3] == 1, ( + n.name + ": Upsampling is only supported for scales with " + "the first and last dimensions being 1." + ) + spatial_scale = scales[1] + assert spatial_scale == int(spatial_scale), ( + "%s: Upsampling is only supported for integer scales." % n.name + ) + + assert in_shape[1] == in_shape[2], ( + "%s: Upsampling is only supported for quadratic input shapes." + % n.name + ) + + # Extract information for HLS node + IFMDim = in_shape[1] + OFMDim = int(round(in_shape[1] * spatial_scale)) + NumChannels = in_shape[-1] + numInputVectors = in_shape[0] + inputDataType = dt.name + + # Insert the HLSCustomOp node + Upsample_HLS_node = helper.make_node( + "UpsampleNearestNeighbour_Batch", + [n.input[0]], + [n.output[0]], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + OFMDim=OFMDim, + IFMDim=IFMDim, + NumChannels=NumChannels, + inputDataType=inputDataType, + numInputVectors=numInputVectors, + name="UpsampleNearestNeighbour_Batch_" + n.name, + ) + + # Remove the old node + graph.node.insert(node_ind, Upsample_HLS_node) + # remove old nodes + graph.node.remove(n) + graph_modified = True + return (model, graph_modified) + + class InferStreamingMaxPool(Transformation): """Convert MaxPoolNHWC layers to StreamingMaxPool layers.""" @@ -251,6 +364,7 @@ class InferStreamingMaxPool(Transformation): NumChannels=ifm_ch, ImgDim=(ifm_dim_h, ifm_dim_w), dataType=dt.name, + name="StreamingMaxPool_Batch_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -273,7 +387,7 @@ class InferPool_Batch(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type in ["MaxPool", "QuantAvgPool2d"]: + if n.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]: # extract pool parameters if n.op_type == "MaxPool": @@ -286,6 +400,15 @@ class InferPool_Batch(Transformation): k = inst.get_nodeattr("kernel") stride = inst.get_nodeattr("stride") dlayout = inst.get_nodeattr("data_layout") + 
elif n.op_type == "MaxPoolNHWC": + inst = getCustomOp(n) + k_shape = inst.get_nodeattr("kernel_shape") + strides = inst.get_nodeattr("strides") + assert k_shape[0] == k_shape[1] + assert strides[0] == strides[1] + k = k_shape[0] + stride = strides[0] + dlayout = "NHWC" try: pad = get_by_name(n.attribute, "pads").ints[-1] except AttributeError: @@ -302,7 +425,8 @@ class InferPool_Batch(Transformation): continue elif k == stride: warnings.warn( - """Inferring Pool_Batch node for k == stride. + n.name + + """: Inferring Pool_Batch node for k == stride. This case can be optimized. For example, for MaxPool run InferStreamingMaxPool before InferPool_Batch """ @@ -363,7 +487,7 @@ class InferPool_Batch(Transformation): accum_bits = 0 pool_size_param = k pad_value = 0 - if n.op_type == "MaxPool": + if n.op_type in ["MaxPool", "MaxPoolNHWC"]: pool_fxn = "MaxPool" odt = idt pad_value = idt.min() @@ -393,6 +517,7 @@ class InferPool_Batch(Transformation): pad_value=pad_value, depthwise=1, input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch), + name="Im2Col_" + n.name, ) # Warning PE has to be equal to ifm_ch until Im2Col is replaced by @@ -415,6 +540,7 @@ class InferPool_Batch(Transformation): AccumBits=accum_bits, Size=pool_size_param, BatchSize=1, + name="Pool_Batch_" + n.name, ) if dlayout == "NCHW": @@ -463,14 +589,16 @@ class InferBinaryStreamingFCLayer(Transformation): mm_output = n.output[0] mm_in_shape = model.get_tensor_shape(mm_input) mm_out_shape = model.get_tensor_shape(mm_output) - assert ( - model.get_tensor_datatype(mm_input) == DataType.BINARY - ), """First + assert model.get_tensor_datatype(mm_input) == DataType.BINARY, ( + n.name + + """: First input for xnorpopcount is not set to FINN DataType BINARY.""" - assert ( - model.get_tensor_datatype(mm_weight) == DataType.BINARY - ), """Second + ) + assert model.get_tensor_datatype(mm_weight) == DataType.BINARY, ( + n.name + + """: Second input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" 
+ ) idt = DataType.BINARY wdt = DataType.BINARY mm_output = n.output[0] @@ -484,13 +612,12 @@ class InferBinaryStreamingFCLayer(Transformation): # create node with no parallelization first pe = 1 simd = 1 - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - assert mw % simd == 0, "Requirement MW divisable by SIMD is violated." wmem = mw * mh // (pe * simd) - assert ( - mw * mh == wmem * pe * simd - ), """Requirement (MW * MH) divisiable by + assert mw * mh == wmem * pe * simd, ( + n.name + + """: Requirement (MW * MH) divisiable by (WMEM * PE * SIMD) is violated.""" + ) # see if we have any following thresholds consumer = model.find_consumer(mm_output) if consumer is not None and consumer.op_type == "MultiThreshold": @@ -500,10 +627,11 @@ class InferBinaryStreamingFCLayer(Transformation): mt_out_shape = model.get_tensor_shape(mt_output) mt_thres = consumer.input[1] T = model.get_initializer(mt_thres) - assert ( - T.shape[0] == 1 or T.shape[0] == mh - ), """First dimension of + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of thresholds neither 1 nor MH.""" + ) odt = model.get_tensor_datatype(mt_output) if odt.bitwidth() == 1: # covers both bipolar and binary @@ -531,6 +659,7 @@ class InferBinaryStreamingFCLayer(Transformation): noActivation=0, numInputVectors=list(mm_in_shape[:-1]), mem_mode=self.mem_mode, + name=n.name, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -561,6 +690,7 @@ class InferBinaryStreamingFCLayer(Transformation): noActivation=1, numInputVectors=list(mm_in_shape[:-1]), mem_mode=self.mem_mode, + name=n.name, ) graph.node.insert(node_ind, new_node) # remove old node @@ -608,15 +738,12 @@ class InferQuantizedStreamingFCLayer(Transformation): # create node with no parallelization first pe = 1 simd = 1 - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - assert ( - mw % simd == 0 - ), "Requirement MW divisable by SIMD is violated." 
wmem = mw * mh // (pe * simd) - assert ( - mw * mh == wmem * pe * simd - ), """Requirement (MW * MH) divisible by + assert mw * mh == wmem * pe * simd, ( + n.name + + """: Requirement (MW * MH) divisible by (WMEM * PE * SIMD) is violated.""" + ) # see if we have any following thresholds consumer = model.find_consumer(mm_output) if consumer is not None and consumer.op_type == "MultiThreshold": @@ -626,27 +753,30 @@ class InferQuantizedStreamingFCLayer(Transformation): mt_out_shape = model.get_tensor_shape(mt_output) mt_thres = consumer.input[1] T = model.get_initializer(mt_thres) - assert ( - T.shape[0] == 1 or T.shape[0] == mh - ), """First dimension of + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of thresholds neither 1 nor MH.""" + ) odt = model.get_tensor_datatype(mt_output) scale = getCustomOp(consumer).get_nodeattr("out_scale") actval = getCustomOp(consumer).get_nodeattr("out_bias") - assert ( - int(actval) == actval - ), "out_bias must be integer for HLS conversion." + assert int(actval) == actval, ( + consumer.name + + ": out_bias must be integer for HLS conversion." + ) actval = int(actval) odt_is_bipolar = odt == DataType.BIPOLAR bipolar_ok = ( odt_is_bipolar and (scale == 2.0) and (actval == -1) ) - assert ( - scale == 1.0 or bipolar_ok - ), "out_scale = 1.0 or bipolar output needed for conversion." - assert (not odt.signed()) or ( - actval < 0 - ), "Signed output requres actval < 0" + assert scale == 1.0 or bipolar_ok, ( + consumer.name + + ": out_scale=1 or bipolar output needed for conversion." 
+ ) + assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requres actval < 0" + ) model.set_tensor_shape(mm_input, mm_in_shape) model.set_tensor_shape(mt_output, mt_out_shape) if bipolar_ok: @@ -672,6 +802,7 @@ class InferQuantizedStreamingFCLayer(Transformation): noActivation=0, numInputVectors=list(mm_in_shape[:-1]), mem_mode=self.mem_mode, + name="StreamingFCLayer_Batch_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -702,6 +833,7 @@ class InferQuantizedStreamingFCLayer(Transformation): noActivation=1, numInputVectors=list(mm_in_shape[:-1]), mem_mode=self.mem_mode, + name="StreamingFCLayer_Batch_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old node @@ -736,7 +868,8 @@ class InferVVAU(Transformation): k_h, k_w = sparsity["dw"]["kernel_shape"] except KeyError: raise Exception( - """Sparsity doesn't indicate that MatMul + n.name + + """: sparsity annotation doesn't indicate that MatMul belongs to a depthwise convolution.""" ) @@ -772,9 +905,6 @@ class InferVVAU(Transformation): model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w)) # create node with pe=channels as default pe = channels - assert ( - channels % pe == 0 - ), "Requirement Channels divisable by PE is violated." # see if we have any following thresholds consumer = model.find_consumer(mm_output) if consumer is not None and consumer.op_type == "MultiThreshold": @@ -783,23 +913,26 @@ class InferVVAU(Transformation): mt_out_shape = model.get_tensor_shape(mt_output) mt_thres = consumer.input[1] T = model.get_initializer(mt_thres) - assert ( - T.shape[0] == 1 or T.shape[0] == channels - ), """First dimension of + assert T.shape[0] == 1 or T.shape[0] == channels, ( + consumer.name + + """: First dimension of thresholds neither 1 nor Channels.""" + ) odt = model.get_tensor_datatype(mt_output) scale = getCustomOp(consumer).get_nodeattr("out_scale") - assert ( - scale == 1.0 - ), "out_scale must be equal to 1.0 for HLS conversion." 
+ assert scale == 1.0, ( + consumer.name + + ": out_scale must be equal to 1.0 for HLS conversion." + ) actval = getCustomOp(consumer).get_nodeattr("out_bias") - assert ( - int(actval) == actval - ), "out_bias must be integer for HLS conversion." + assert int(actval) == actval, ( + consumer.name + + ": out_bias must be integer for HLS conversion." + ) actval = int(actval) - assert (not odt.signed()) or ( - actval < 0 - ), "Signed output requres actval < 0" + assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requres actval < 0" + ) model.set_tensor_shape(mm_input, mm_in_shape) model.set_tensor_shape(mt_output, mt_out_shape) # create and insert new Vector_Vector_Activate_Batch node @@ -819,6 +952,7 @@ class InferVVAU(Transformation): outputDataType=odt.name, ActVal=actval, noActivation=0, + name="Vector_Vector_Activate_Batch_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -847,6 +981,7 @@ class InferVVAU(Transformation): outputDataType=odt.name, ActVal=0, noActivation=1, + name="Vector_Vector_Activate_Batch_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old node @@ -904,21 +1039,22 @@ class InferThresholdingLayer(Transformation): ifc = int(thl_in_shape[-1]) # create node with no parallelization first pe = 1 - assert ifc % pe == 0, "Requirement IFC divisable by PE is violated." odt = model.get_tensor_datatype(thl_output) scale = getCustomOp(node).get_nodeattr("out_scale") - assert ( - scale == 1.0 - ), "MultiThreshold out_scale must be equal to 1.0 for HLS conversion." + assert scale == 1.0, ( + node.name + + ": MultiThreshold out_scale must be 1 for HLS conversion." + ) actval = getCustomOp(node).get_nodeattr("out_bias") - assert ( - int(actval) == actval - ), "MultiThreshold out_bias must be integer for HLS conversion." + assert int(actval) == actval, ( + node.name + + ": MultiThreshold out_bias must be integer for HLS conversion." 
+ ) actval = int(actval) - assert (not odt.signed()) or ( - actval < 0 - ), "Signed output requres actval < 0" + assert (not odt.signed()) or (actval < 0), ( + node.name + ": Signed output requres actval < 0" + ) # create and insert new Thresholding_Batch node new_node = helper.make_node( "Thresholding_Batch", @@ -935,6 +1071,7 @@ class InferThresholdingLayer(Transformation): numInputVectors=list(thl_in_shape[:-1]), ActVal=actval, mem_mode=self.mem_mode, + name="Thresholding_Batch_" + node.name, ) graph.node.insert(insert_point, new_node) # remove old node @@ -1008,9 +1145,6 @@ class InferAddStreamsLayer(Transformation): num_channels = int(in0_shape[-1]) # create node with no parallelization first pe = 1 - assert ( - num_channels % pe == 0 - ), "Requirement Channels divisable by PE is violated." # create and insert new StreamingFCLayer node new_node = helper.make_node( @@ -1023,6 +1157,7 @@ class InferAddStreamsLayer(Transformation): PE=pe, inputDataType=idt.name, numInputVectors=in0_shape[:-1], + name="AddStreams_Batch_" + node.name, ) graph.node.insert(insert_point, new_node) # remove old node @@ -1069,9 +1204,6 @@ class InferDuplicateStreamsLayer(Transformation): # create node with no parallelization first pe = 1 - assert ( - num_ch % pe == 0 - ), "Requirement channels divisable by PE is violated." 
dup_node = helper.make_node( "DuplicateStreams_Batch", @@ -1083,6 +1215,7 @@ class InferDuplicateStreamsLayer(Transformation): PE=pe, inputDataType=dt.name, numInputVectors=vecs, + name="DuplicateStreams_Batch_" + node.name, ) graph.node.insert(node_ind, dup_node) @@ -1251,6 +1384,7 @@ class InferChannelwiseLinearLayer(Transformation): paramDataType=pdt.name, outputDataType=odt.name, numInputVectors=list(ll_in_shape[:-1]), + name="ChannelwiseOp_Batch_" + node.name, ) graph.node.insert(insert_point, new_node) # remove old node @@ -1293,9 +1427,6 @@ class InferLabelSelectLayer(Transformation): num_inp_vecs = list(fc_in_shape[:-1]) # create node with no parallelization first pe = 1 - assert ( - num_labels % pe == 0 - ), "Requirement Labels divisable by PE is violated." k = model.get_initializer(k_input)[0] @@ -1311,6 +1442,7 @@ class InferLabelSelectLayer(Transformation): K=k, inputDataType=idt.name, numInputVectors=num_inp_vecs, + name="LabelSelect_Batch_" + node.name, ) graph.node.insert(node_ind, new_node) # remove old node @@ -1364,9 +1496,6 @@ class InferGlobalAccPoolLayer(Transformation): vecs = in0_shape[:-1] # create node with no parallelization first pe = 1 - assert ( - num_ch % pe == 0 - ), "Requirement Labels divisable by PE is violated." 
# create an additional tensor of the same shape and layout as result out_shape = model.get_tensor_shape(result) @@ -1387,6 +1516,7 @@ class InferGlobalAccPoolLayer(Transformation): PE=pe, inputDataType=idt.name, numInputVectors=vecs, + name="GlobalAccPool_Batch_" + node.name, ) mul_value = helper.make_tensor_value_info( diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py index 0aba60f9b6f08210c40f305694495b77f517f323..9b2577bc2b863e1075fc3252412ff1001b955cda 100644 --- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py +++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py @@ -26,11 +26,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import copy -from onnx import helper - +from finn.core.modelwrapper import ModelWrapper from finn.custom_op.registry import getCustomOp from finn.transformation.base import Transformation +from finn.transformation.create_generic_partitions import PartitionFromLambda +from finn.transformation.fpgadataflow.externalize_params import ExternalizeParams from finn.util.basic import get_by_name, make_build_dir @@ -41,120 +41,76 @@ class CreateDataflowPartition(Transformation): that indicates the filename for the second graph that only contains dataflow nodes. No action is taken if there are no dataflow nodes.""" - def __init__(self): + def __init__(self, partition_model_dir=None): super().__init__() + if partition_model_dir is None: + self.partition_model_dir = make_build_dir("dataflow_partition_") + else: + self.partition_model_dir = partition_model_dir def apply(self, model): - target_partition_id = 0 - # we currently assume that all dataflow nodes belonging to the same partition - # are connected to each other and there is a single input/output to/from each. 
- # NOTE: all dataflow nodes with no partition_id set are moved to partition 0 - # TODO: check the assumption and/or improve this. - while True: - all_nodes = list(model.graph.node) - df_nodes = filter( - lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes - ) - df_nodes = filter( - lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8") - == "fpgadataflow" - and ( - get_by_name(x.attribute, "partition_id") is None - or get_by_name(x.attribute, "partition_id").i == target_partition_id - ) - and x.op_type != "StreamingDataflowPartition", - df_nodes, - ) - df_nodes = list(df_nodes) - non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes) - non_df_nodes = list(non_df_nodes) - - if len(df_nodes) == 0: - # no changes if no dataflow nodes are present - break - else: - # partition the model into two models - df_model = copy.deepcopy(model) - non_df_model = model - # remove all non-dataflow nodes from the dataflow model - for node_to_remove in non_df_nodes: - df_model.graph.node.remove(node_to_remove) - # identify the entry and exit points for the dataflow part - df_in = df_model.graph.node[0].input[0] - df_out = df_model.graph.node[-1].output[0] - df_in_vi = df_model.get_tensor_valueinfo(df_in) - df_out_vi = df_model.get_tensor_valueinfo(df_out) - # set df graph in/out to be df_in/df_out - df_model.graph.input.remove(df_model.graph.input[0]) - df_model.graph.input.insert(0, df_in_vi) - df_model.graph.output.remove(df_model.graph.output[0]) - df_model.graph.output.insert(0, df_out_vi) - # parse StreamingFCLayers looking for external weight memories - fc_extw_nodes = filter( - lambda x: x.op_type == "StreamingFCLayer_Batch" - and get_by_name(x.attribute, "mem_mode") is not None - and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") - == "external", - df_nodes, - ) - fc_extw_nodes = list(fc_extw_nodes) - extra_df_inputs = [] + def filter_fc_extw(x): + if x.op_type == "IODMA": + burst_mode = get_by_name(x.attribute, "burstMode") + if 
burst_mode is not None: + burst_mode = burst_mode.s.decode("UTF-8") + return burst_mode == "wrap" - for i in range(len(fc_extw_nodes)): - fc_weight_vi = df_model.get_tensor_valueinfo( - fc_extw_nodes[i].input[1] - ) - df_model.graph.input.insert(i + 1, fc_weight_vi) - extra_df_inputs.append(fc_extw_nodes[i].input[1]) + extw_dma_nodes = list(filter(filter_fc_extw, model.graph.node)) + if len(extw_dma_nodes) > 0: + model = model.transform(ExternalizeParams()) - # save model - df_model_dir = make_build_dir( - "dataflow_partition" + str(target_partition_id) + "_" - ) - df_model_filename = df_model_dir + "/df_model.onnx" - df_model.cleanup() - df_model.save(df_model_filename) - # remove all dataflow nodes from the non-dataflow model - # keep track of where the dataflow part starts - df_start_ind = all_nodes.index(df_nodes[0]) - - # get and check floorplan - inst = getCustomOp(df_nodes[0]) - slr = inst.get_nodeattr("slr") - for node in df_nodes[1:]: - inst = getCustomOp(node) - assert slr == inst.get_nodeattr( - "slr" - ), """all nodes with - same partition_id must have the same slr id""" - - # check that there is only one non-null mem_port per partition - nmemports = 0 - mem_port = "" - for node in df_nodes: - inst = getCustomOp(node) - port = inst.get_nodeattr("mem_port") - if port is not None and port != "": - nmemports += 1 - mem_port = port - assert nmemports <= 1, """too many memory ports per partition""" + def assign_partition_id(node): + if node.op_type in ["GenericPartition", "StreamingDataflowPartition"]: + return -1 + else: + backend = get_by_name(node.attribute, "backend") + if backend is not None and backend.s.decode("UTF-8") == "fpgadataflow": + assigned_partition = get_by_name(node.attribute, "partition_id") + if assigned_partition is not None: + return assigned_partition.i + else: + return 0 + else: + return -1 - for node_to_remove in df_nodes: - non_df_model.graph.node.remove(node_to_remove) - # create StreamingDataflow node with df_in/df_out io - 
df_node = helper.make_node( - "StreamingDataflowPartition", - [df_in] + extra_df_inputs, - [df_out], - # use the model attribute to mark the df model - model=df_model_filename, - domain="finn.custom_op.fpgadataflow", - partition_id=target_partition_id, - slr=slr, - mem_port=mem_port, - ) - non_df_model.graph.node.insert(df_start_ind, df_node) - model = non_df_model - target_partition_id += 1 + # first, use the generic partitioning functionality to split up the graph + parent_model = model.transform( + PartitionFromLambda( + partitioning=assign_partition_id, partition_dir=self.partition_model_dir + ) + ) + # change node types to StreamingDataflowPartition + p_nodes = parent_model.get_nodes_by_op_type("GenericPartition") + for partition_ind, p_node in enumerate(p_nodes): + # go into partition to extract some info + p_node_inst = getCustomOp(p_node) + node_model_filename = p_node_inst.get_nodeattr("model") + p_model = ModelWrapper(node_model_filename) + # check floorplan (SLR assignment per node) + inst = getCustomOp(p_model.graph.node[0]) + slr = inst.get_nodeattr("slr") + for node in p_model.graph.node: + inst = getCustomOp(node) + assert slr == inst.get_nodeattr( + "slr" + ), """all nodes with same partition_id must have the same slr id""" + # check that there is only one non-null mem_port per partition + nmemports = 0 + mem_port = "" + for node in p_model.graph.node: + inst = getCustomOp(node) + port = inst.get_nodeattr("mem_port") + if port is not None and port != "": + nmemports += 1 + mem_port = port + assert nmemports <= 1, """Too many memory ports per partition""" + # done, change node type and add info in parent graph + p_node.op_type = "StreamingDataflowPartition" + p_node.domain = "finn.custom_op.fpgadataflow" + new_p_node_inst = getCustomOp(p_node) + new_p_node_inst.set_nodeattr("partition_id", partition_ind) + new_p_node_inst.set_nodeattr("slr", slr) + new_p_node_inst.set_nodeattr("mem_port", mem_port) - return (model, False) + return (parent_model, 
False) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 74bc7395512425e6b81defe5cec4afaa4b669e90..c2ded29d24eb1c75fcba8b4053c78920a31b1d3b 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -86,11 +86,6 @@ class CreateStitchedIP(Transformation): self.clk_ns = clk_ns self.ip_name = ip_name self.vitis = vitis - if float(clk_ns) not in [5.0, 10.0, 20.0]: - warnings.warn( - """The chosen frequency may lead to failure due to clock divider - constraints.""" - ) self.has_aximm = False self.has_m_axis = False self.m_axis_idx = 0 @@ -221,6 +216,13 @@ class CreateStitchedIP(Transformation): ip_dirs = ["list"] # add RTL streamer IP ip_dirs.append("/workspace/finn/finn-rtllib/memstream") + if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]: + warnings.warn( + """First node is not StreamingFIFO or IODMA. + You may experience incorrect stitched-IP rtlsim or hardware + behavior. 
It is strongly recommended to insert FIFOs prior to + calling CreateStitchedIP.""" + ) # ensure that all nodes are fpgadataflow, and that IPs are generated for node in model.graph.node: assert is_fpgadataflow_node( @@ -333,8 +335,8 @@ class CreateStitchedIP(Transformation): # if targeting Vitis, add some properties to the IP if self.vitis: tcl.append( - "ipx::remove_bus_parameter FREQ_HZ " - "[ipx::get_bus_interfaces CLK.AP_CLK -of_objects [ipx::current_core]]" + "set_property value_resolve_type user [ipx::get_bus_parameters " + "-of [ipx::get_bus_interfaces -of [ipx::current_core ]]]" ) # replace source code with dcp tcl.append( diff --git a/src/finn/transformation/fpgadataflow/externalize_params.py b/src/finn/transformation/fpgadataflow/externalize_params.py new file mode 100644 index 0000000000000000000000000000000000000000..dcb66a8538fdff46214c23491f48a59459625082 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/externalize_params.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from finn.transformation.base import Transformation +from finn.util.basic import get_by_name + + +class ExternalizeParams(Transformation): + """Create top-level graph inputs for IODMAs serving layers where weights are + marked as external using mem_mode="external".""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph_modified = False + + def filter_fc_extw(x): + if x.op_type == "IODMA": + burst_mode = get_by_name(x.attribute, "burstMode") + if burst_mode is not None: + burst_mode = burst_mode.s.decode("UTF-8") + return burst_mode == "wrap" + + dma_extw_nodes = list(filter(filter_fc_extw, model.graph.node)) + + for dma_extw in dma_extw_nodes: + extw_tensor_name = dma_extw.input[0] + extw_tensor_name_out = dma_extw.output[0] + if extw_tensor_name in [x.name for x in model.graph.input]: + continue + else: + extw_vi = model.get_tensor_valueinfo(extw_tensor_name) + assert extw_vi is not None + model.graph.value_info.remove(extw_vi) + model.graph.input.append(extw_vi) + iodma_init = model.get_initializer(extw_vi.name) + assert iodma_init is not None + # remove output-side initializer to get correct dataflow partitioning + model.graph.initializer.remove( + [ + x + for x in model.graph.initializer + if x.name == extw_tensor_name_out + ][0] + ) + graph_modified = True + + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py 
b/src/finn/transformation/fpgadataflow/insert_fifo.py index ef56db6376703ce1eb0134c173de61a562bca6e6..c8bb716922823876f5f16ffe62f17c425d49aa74 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -180,49 +180,50 @@ class InsertFIFO(Transformation): n.input[0] = fifo_output_tensor.name # insert FIFO as last node, except when last node is DMA - if ( - graph.node[-1].op_type != "StreamingFIFO" - and graph.node[-1].op_type != "IODMA" - ): - n = graph.node[-1] - assert ( - n.op_type != "TLastMarker" - ), """Insert tlast marker should be done - after inserting the FIFOs""" - graph_out_name = graph.output[0].name - n0 = getCustomOp(n) - # determine fifo node attributes - fld_shape = n0.get_folded_output_shape() - dtype = n0.get_output_datatype() - fifo_depth = n0.get_nodeattr("outFIFODepth") - - if fifo_depth <= 2: - warnings.warn("Overriding output FIFO depth to 32") - fifo_depth = 32 - - # create fifo node - fifo_input_tensor = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - n0.get_normal_output_shape(), - ) - graph.value_info.append(fifo_input_tensor) - model.set_tensor_datatype(fifo_input_tensor.name, dtype) - - fifo_node = oh.make_node( - "StreamingFIFO", - [fifo_input_tensor.name], - [graph_out_name], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - depth=fifo_depth, - folded_shape=fld_shape, - dataType=str(dtype.name), - ) - # insert fifo - graph.node.append(fifo_node) - - # set fifo output tensor as new input tensor of second node - n.output[0] = fifo_input_tensor.name + graph_out_names = [x.name for x in model.graph.output] + for graph_out_name in graph_out_names: + final_node = model.find_producer(graph_out_name) + if ( + final_node.op_type != "StreamingFIFO" + and final_node.op_type != "IODMA" + ): + assert ( + final_node.op_type != "TLastMarker" + ), """Insert tlast marker should be done + after inserting the FIFOs""" + n0 = 
getCustomOp(final_node) + # determine fifo node attributes + fld_shape = n0.get_folded_output_shape() + dtype = n0.get_output_datatype() + fifo_depth = n0.get_nodeattr("outFIFODepth") + + if fifo_depth <= 2: + warnings.warn("Overriding output FIFO depth to 32") + fifo_depth = 32 + + # create fifo node + fifo_input_tensor = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + n0.get_normal_output_shape(), + ) + graph.value_info.append(fifo_input_tensor) + model.set_tensor_datatype(fifo_input_tensor.name, dtype) + + fifo_node = oh.make_node( + "StreamingFIFO", + [fifo_input_tensor.name], + [graph_out_name], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + depth=fifo_depth, + folded_shape=fld_shape, + dataType=str(dtype.name), + ) + # insert fifo + graph.node.append(fifo_node) + + # set fifo output tensor as new input tensor of second node + final_node.output[0] = fifo_input_tensor.name return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index d4b2a1032aeb305c85ffb535ac821692ce747c18..d0ef270816c362af730a75b59be71d0457e0b8e2 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -87,6 +87,7 @@ class InsertIODMA(Transformation): return reshaped_w def apply(self, model): + modified = False # only makes sense for a pure fpgadataflow graph -- so we check! 
all_nodes = list(model.graph.node) assert all( @@ -102,59 +103,14 @@ class InsertIODMA(Transformation): all_nodes, ) ) - graph_in_name = model.graph.input[0].name - first_node = model.find_consumer(graph_in_name) - graph_out_name = model.graph.output[0].name - final_node = model.find_producer(graph_out_name) - if ( - final_node.op_type == "IODMA" - and first_node.op_type == "IODMA" - and len(fc_extw_nodes) == 0 - ): - # TODO maybe check the correctness of properties - return (model, False) - else: - if final_node.op_type != "IODMA": - out_shape = model.get_tensor_shape(graph_out_name) - out_dtype = model.get_tensor_datatype(graph_out_name) - final_node_inst = getCustomOp(final_node) - out_folded_shape = final_node_inst.get_folded_output_shape() - # take advantage of AXI stream width padding for DMA alignment - # (AXI streams are always padded to 8 bits) - # this is the width of stream input to DMA - padded_outstream_width = final_node_inst.get_outstream_width_padded() - padded_outstream_bytes = padded_outstream_width // 8 - # determine the feasible interface width - transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1]) - intfwidth = math.gcd(transfer_bits, self.max_intfwidth) - assert ( - intfwidth % 8 == 0 - ), "No feasible interface width for transfer size" - # make new buffer - final_node_out = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(final_node_out) - model.set_tensor_datatype(final_node_out.name, out_dtype) - # reroute final node output to final_node_out_name - final_node.output[0] = final_node_out.name - # FIXME: currently always using 8-bit dtypes to work around the - # padding problems for i/o DMA - dma_node = oh.make_node( - "IODMA", - [final_node_out.name], - [graph_out_name], - numInputVectors=out_folded_shape[:-1], - NumChannels=padded_outstream_bytes, - dataType="UINT8", - intfWidth=intfwidth, - streamWidth=padded_outstream_width, - direction="out", 
- domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ) - model.graph.node.append(dma_node) - if first_node.op_type != "IODMA": + # insert IODMAs for graph inputs + graph_in_names = [x.name for x in model.graph.input] + for graph_in_name in graph_in_names: + first_node = model.find_consumer(graph_in_name) + if first_node.op_type == "IODMA": + # IODMA already inserted for this input + continue + else: in_shape = model.get_tensor_shape(graph_in_name) in_dtype = model.get_tensor_datatype(graph_in_name) first_node_inst = getCustomOp(first_node) @@ -194,47 +150,96 @@ class InsertIODMA(Transformation): backend="fpgadataflow", ) model.graph.node.insert(0, dma_node) - for fc_node in fc_extw_nodes: - fc_inst = getCustomOp(fc_node) - fc_w_name = fc_node.input[1] - w_shape = model.get_tensor_shape(fc_w_name) - w_dtype = model.get_tensor_datatype(fc_w_name) + modified = True + # insert IODMAs for graph outputs + graph_out_names = [x.name for x in model.graph.output] + for graph_out_name in graph_out_names: + final_node = model.find_producer(graph_out_name) + if final_node.op_type == "IODMA": + continue + else: + out_shape = model.get_tensor_shape(graph_out_name) + out_dtype = model.get_tensor_datatype(graph_out_name) + final_node_inst = getCustomOp(final_node) + out_folded_shape = final_node_inst.get_folded_output_shape() + # take advantage of AXI stream width padding for DMA alignment + # (AXI streams are always padded to 8 bits) + # this is the width of stream input to DMA + padded_outstream_width = final_node_inst.get_outstream_width_padded() + padded_outstream_bytes = padded_outstream_width // 8 # determine the feasible interface width - transfer_bits = np.prod(w_shape) * w_dtype.bitwidth() + transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1]) intfwidth = math.gcd(transfer_bits, self.max_intfwidth) assert ( intfwidth % 8 == 0 ), "No feasible interface width for transfer size" - # calculate width of stream output from DMA - pe = 
get_by_name(fc_node.attribute, "PE").i - simd = get_by_name(fc_node.attribute, "SIMD").i - streamWidth = fc_inst.get_weightstream_width_padded() # make new buffer - W = model.get_initializer(fc_w_name) - iodma_mem = self.get_mem_init(W, pe, simd) - model.set_initializer(fc_w_name, iodma_mem) - - fc_node_in = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape + final_node_out = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape ) - model.graph.value_info.append(fc_node_in) - model.set_tensor_datatype(fc_node_in.name, w_dtype) - model.set_initializer(fc_node_in.name, W) + model.graph.value_info.append(final_node_out) + model.set_tensor_datatype(final_node_out.name, out_dtype) + # reroute final node output to final_node_out_name + final_node.output[0] = final_node_out.name + # FIXME: currently always using 8-bit dtypes to work around the + # padding problems for i/o DMA dma_node = oh.make_node( "IODMA", - [fc_w_name], - [fc_node_in.name], - numInputVectors=[iodma_mem.shape[0]], - NumChannels=pe * simd, - dataType=str(w_dtype.name), + [final_node_out.name], + [graph_out_name], + numInputVectors=out_folded_shape[:-1], + NumChannels=padded_outstream_bytes, + dataType="UINT8", intfWidth=intfwidth, - streamWidth=streamWidth, - direction="in", - burstMode="wrap", + streamWidth=padded_outstream_width, + direction="out", domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", ) - fc_node.input[1] = fc_node_in.name - model.graph.node.insert(0, dma_node) + model.graph.node.append(dma_node) + modified = True + + for fc_node in fc_extw_nodes: + fc_inst = getCustomOp(fc_node) + fc_w_name = fc_node.input[1] + w_shape = model.get_tensor_shape(fc_w_name) + w_dtype = model.get_tensor_datatype(fc_w_name) + # determine the feasible interface width + transfer_bits = np.prod(w_shape) * w_dtype.bitwidth() + intfwidth = math.gcd(transfer_bits, self.max_intfwidth) + assert intfwidth % 8 == 0, "No 
feasible interface width for transfer size" + # calculate width of stream output from DMA + pe = get_by_name(fc_node.attribute, "PE").i + simd = get_by_name(fc_node.attribute, "SIMD").i + streamWidth = fc_inst.get_weightstream_width_padded() + # make new buffer + W = model.get_initializer(fc_w_name) + iodma_mem = self.get_mem_init(W, pe, simd) + model.set_initializer(fc_w_name, iodma_mem) + + fc_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape + ) + model.graph.value_info.append(fc_node_in) + model.set_tensor_datatype(fc_node_in.name, w_dtype) + model.set_initializer(fc_node_in.name, W) + dma_node = oh.make_node( + "IODMA", + [fc_w_name], + [fc_node_in.name], + numInputVectors=[iodma_mem.shape[0]], + NumChannels=pe * simd, + dataType=str(w_dtype.name), + intfWidth=intfwidth, + streamWidth=streamWidth, + direction="in", + burstMode="wrap", + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ) + fc_node.input[1] = fc_node_in.name + model.graph.node.insert(0, dma_node) + modified = True + if modified: model = model.transform(SortGraph()) - return (model, True) + return (model, modified) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index be2176a34763fdb5521a0acdfc3137fb4b4a766e..bfa2fdbf9594c52d9a3a2376312a929b9008c9ea 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -90,6 +90,7 @@ class MakePYNQDriver(Transformation): self.platform = platform def apply(self, model): + # create a temporary folder for the generated driver pynq_driver_dir = make_build_dir(prefix="pynq_driver_") model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) @@ -100,59 +101,100 @@ class MakePYNQDriver(Transformation): ) driver_base_py = pynq_driver_dir + "/driver_base.py" shutil.copy(driver_base_template, driver_base_py) - # extract input-output shapes 
from the graph # TODO convert this to an analysis pass? - i_tensor_name = model.graph.input[0].name - o_tensor_name = model.graph.output[0].name - i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name)) - o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name)) - i_tensor_dt = model.get_tensor_datatype(i_tensor_name) - o_tensor_dt = model.get_tensor_datatype(o_tensor_name) - - first_node = model.find_consumer(i_tensor_name) - last_node = model.find_producer(o_tensor_name) - if first_node.op_type == "StreamingDataflowPartition": - # IODMAs and dataflow partitions have already been created - # extract folded i/o shapes from IODMA consumer/producer - first_df_model = ModelWrapper(getCustomOp(first_node).get_nodeattr("model")) + idt = [] + idma_names = [] + ishape_normal = [] + ishape_folded = [] + ishape_packed = [] + for idma_ind, graph_in in enumerate(model.graph.input): + i_tensor_name = graph_in.name + # get inp tensor properties + i_tensor_dt = model.get_tensor_datatype(i_tensor_name) + i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name)) + # go down into dataflow partition to get folded shape info etc + # TODO consider setting these as attributes during dataflow partitioning + i_consumer = model.find_consumer(i_tensor_name) + assert ( + i_consumer.op_type == "StreamingDataflowPartition" + ), """ + Ensure CreateDataflowPartition called before driver creation.""" + first_df_model = ModelWrapper(getCustomOp(i_consumer).get_nodeattr("model")) assert ( first_df_model.graph.node[0].op_type == "IODMA" ), "First partition must hold input IODMA" - successors = model.find_direct_successors(first_node) + successors = model.find_direct_successors(i_consumer) + successor_input_num = list(successors[0].input).index(i_consumer.output[0]) successor_sdp = getCustomOp(successors[0]) successor_df_model = ModelWrapper(successor_sdp.get_nodeattr("model")) first_node = successor_df_model.find_consumer( - successor_df_model.graph.input[0].name 
+ successor_df_model.graph.input[successor_input_num].name ) - - last_df_model = ModelWrapper(getCustomOp(last_node).get_nodeattr("model")) + i_tensor_shape_folded = tuple( + getCustomOp(first_node).get_folded_input_shape() + ) + # generate dummy folded i/o tensors and their packed versions + i_tensor_dummy_folded = gen_finn_dt_tensor( + i_tensor_dt, i_tensor_shape_folded + ) + i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray( + i_tensor_dummy_folded, i_tensor_dt + ) + i_tensor_shape_packed = i_tensor_dummy_packed.shape + # append all input tensor info to relevant lists + idt.append(str(i_tensor_dt)) + ishape_normal.append(i_tensor_shape_normal) + ishape_folded.append(i_tensor_shape_folded) + ishape_packed.append(i_tensor_shape_packed) + idma_names.append(getCustomOp(i_consumer).get_nodeattr("instance_name")) + + odt = [] + odma_names = [] + oshape_normal = [] + oshape_folded = [] + oshape_packed = [] + for odma_ind, graph_out in enumerate(model.graph.output): + o_tensor_name = graph_out.name + # get inp tensor properties + o_tensor_dt = model.get_tensor_datatype(o_tensor_name) + o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name)) + # go down into IODMA partition to get folded shape info etc + # TODO consider setting these as attributes during dataflow partitioning + o_producer = model.find_producer(o_tensor_name) + assert ( + o_producer.op_type == "StreamingDataflowPartition" + ), """ + Ensure CreateDataflowPartition called before driver creation.""" + df_model = ModelWrapper(getCustomOp(o_producer).get_nodeattr("model")) assert ( - last_df_model.graph.node[0].op_type == "IODMA" - ), "Last partition must hold output IODMA" - predecessors = model.find_direct_predecessors(last_node) + df_model.graph.node[-1].op_type == "IODMA" + ), "Partition must hold output IODMA" + predecessors = model.find_direct_predecessors(o_producer) + predecessor_output_num = list(predecessors[0].output).index( + o_producer.input[0] + ) predecessor_sdp = 
getCustomOp(predecessors[0]) predecessor_df_model = ModelWrapper(predecessor_sdp.get_nodeattr("model")) last_node = predecessor_df_model.find_producer( - predecessor_df_model.graph.output[0].name + predecessor_df_model.graph.output[predecessor_output_num].name ) - - # else: transformation called before IODMA/SDP creation (legacy flow) - # can access folded i/o shapes directly - i_tensor_shape_folded = tuple(getCustomOp(first_node).get_folded_input_shape()) - o_tensor_shape_folded = tuple(getCustomOp(last_node).get_folded_output_shape()) - - # generate dummy folded i/o tensors and their packed versions - i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded) - o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded) - i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray( - i_tensor_dummy_folded, i_tensor_dt - ) - o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray( - o_tensor_dummy_folded, o_tensor_dt - ) - i_tensor_shape_packed = i_tensor_dummy_packed.shape - o_tensor_shape_packed = o_tensor_dummy_packed.shape + o_tensor_shape_folded = tuple( + getCustomOp(last_node).get_folded_output_shape() + ) + o_tensor_dummy_folded = gen_finn_dt_tensor( + o_tensor_dt, o_tensor_shape_folded + ) + o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray( + o_tensor_dummy_folded, o_tensor_dt + ) + o_tensor_shape_packed = o_tensor_dummy_packed.shape + # append all output tensor info to relevant lists + odt.append(str(o_tensor_dt)) + oshape_normal.append(o_tensor_shape_normal) + oshape_folded.append(o_tensor_shape_folded) + oshape_packed.append(o_tensor_shape_packed) + odma_names.append(getCustomOp(o_producer).get_nodeattr("instance_name")) # generate external weights npy files weights_dir = pynq_driver_dir + "/runtime_weights" @@ -166,47 +208,50 @@ class MakePYNQDriver(Transformation): node.op_type == "StreamingDataflowPartition" ), "CreateDataflowPartition needs to be applied before driver generation" - producer = 
model.find_producer(node.input[0]) - init_tensor = model.get_initializer(node.input[0]) + if len(node.input) > 0: + producer = model.find_producer(node.input[0]) + init_tensor = model.get_initializer(node.input[0]) + else: + producer = None + init_tensor = None if producer is None: # input dma? - idma_name = "idma" + str(idma_idx) - if init_tensor is not None: # input weights dma? + sdp_inst = getCustomOp(node) + idma_name = sdp_inst.get_nodeattr("instance_name") + df_model = ModelWrapper(sdp_inst.get_nodeattr("model")) + assert df_model.graph.node[0].op_type == "IODMA" + iodma_node = getCustomOp(df_model.graph.node[0]) + if iodma_node.get_nodeattr("burstMode") == "wrap": # input weights dma? + init_tensor = df_model.get_initializer( + iodma_node.onnx_node.input[0] + ) ext_weight_dma_cnt += 1 - w_dtype = model.get_tensor_datatype(node.input[0]) + w_dtype = df_model.get_tensor_datatype( + iodma_node.onnx_node.input[0] + ) init_external_tensor = to_external_tensor(init_tensor, w_dtype) np.save( weights_dir + "/" + idma_name + ".npy", init_external_tensor ) - else: - net_input_name = idma_name - idma_idx += 1 # fill in the driver template driver_py = pynq_driver_dir + "/driver.py" driver = template_driver.pynq_driver_template - def mss(x, batch_var_name="1"): - # "make shape string" - # for a shape like (1, ...) emit a string (N, ...) 
- # where N is the default value for batch_var_name - # this lets the driver work with a batch of samples at once - ret = str(x) - ret = ret.replace("(1,", "(%s," % batch_var_name) - ret = ret.replace("[1,", "[%s," % batch_var_name) - return ret - driver = driver.replace("$PLATFORM$", self.platform) - driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt)) - driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal)) - driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded)) - driver = driver.replace("$INPUT_SHAPE_PACKED$", mss(i_tensor_shape_packed)) - driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt)) - driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal)) - driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded)) - driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed)) - driver = driver.replace("$INPUT_DMA_NAME$", "'%s'" % net_input_name) + driver = driver.replace("$INPUT_FINN_DATATYPE$", str(idt).replace("'", "")) + driver = driver.replace("$INPUT_SHAPE_NORMAL$", str(ishape_normal)) + driver = driver.replace("$INPUT_SHAPE_FOLDED$", str(ishape_folded)) + driver = driver.replace("$INPUT_SHAPE_PACKED$", str(ishape_packed)) + driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(odt).replace("'", "")) + driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", str(oshape_normal)) + driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", str(oshape_folded)) + driver = driver.replace("$OUTPUT_SHAPE_PACKED$", str(oshape_packed)) + driver = driver.replace("$INPUT_DMA_NAME$", "%s" % str(idma_names)) + driver = driver.replace("$OUTPUT_DMA_NAME$", "%s" % str(odma_names)) + driver = driver.replace("$NUM_INPUTS$", str(len(idma_names))) + driver = driver.replace("$NUM_OUTPUTS$", str(len(odma_names))) driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt)) with open(driver_py, "w") as f: diff --git 
a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 0f4dd2673c026f4e9179c2ab6e860ca04ced68eb..84d587b6cecea63cb3be41a4a73bcc24aeb822f3 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -42,11 +42,10 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA -from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.util.basic import get_by_name, make_build_dir, pynq_part_map +from finn.util.basic import make_build_dir, pynq_part_map from . import templates @@ -54,19 +53,22 @@ from . import templates def collect_ip_dirs(model, ipstitch_path): # collect list of all IP dirs ip_dirs = [] + need_memstreamer = False for node in model.graph.node: - ip_dir_attribute = get_by_name(node.attribute, "ip_path") - assert ( - ip_dir_attribute is not None - ), """Node attribute "ip_path" is - empty. 
Please run transformation HLSSynth_ipgen first.""" - ip_dir_value = ip_dir_attribute.s.decode("UTF-8") + node_inst = getCustomOp(node) + ip_dir_value = node_inst.get_nodeattr("ip_path") assert os.path.isdir( ip_dir_value ), """The directory that should contain the generated ip blocks doesn't exist.""" ip_dirs += [ip_dir_value] + if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]: + if node_inst.get_nodeattr("mem_mode") == "decoupled": + need_memstreamer = True ip_dirs += [ipstitch_path + "/ip"] + if need_memstreamer: + # add RTL streamer IP + ip_dirs.append("/workspace/finn/finn-rtllib/memstream") return ip_dirs @@ -142,16 +144,21 @@ class MakeZYNQProject(Transformation): # assume only one connection from each ip to the next # all aximm allocated to DDR[0] # all kernels allocated to SLR0 - producer = model.find_producer(node.input[0]) + if len(node.input) == 0: + producer = None + else: + producer = model.find_producer(node.input[0]) consumer = model.find_consumers(node.output[0]) # define kernel instances # name kernels connected to graph inputs as idmaxx - # name kernels connected to graph inputs as odmaxx + # name kernels connected to graph outputs as odmaxx if producer is None or consumer is None: if producer is None: instance_names[node.name] = "idma" + str(idma_idx) + idma_idx += 1 elif consumer is None: instance_names[node.name] = "odma" + str(odma_idx) + odma_idx += 1 config.append( "create_bd_cell -type ip -vlnv %s %s" % (vivado_stitch_vlnv, instance_names[node.name]) @@ -176,7 +183,7 @@ class MakeZYNQProject(Transformation): "assign_axi_addr_proc %s/%s" % (instance_names[node.name], axilite_intf_name) ) - idma_idx += 1 + aximm_idx += 1 axilite_idx += 1 else: @@ -197,6 +204,7 @@ class MakeZYNQProject(Transformation): % (instance_names[node.name], axilite_intf_name) ) axilite_idx += 1 + sdp_node.set_nodeattr("instance_name", instance_names[node.name]) config.append( "connect_bd_net [get_bd_pins %s/ap_clk] " @@ -295,12 +303,19 @@ class 
ZynqBuild(Transformation): """ - def __init__(self, platform, period_ns, enable_debug=False): + def __init__( + self, + platform, + period_ns, + enable_debug=False, + partition_model_dir=None, + ): super().__init__() self.fpga_part = pynq_part_map[platform] self.period_ns = period_ns self.platform = platform self.enable_debug = enable_debug + self.partition_model_dir = partition_model_dir def apply(self, model): # first infer layouts @@ -310,7 +325,7 @@ class ZynqBuild(Transformation): InsertIODMA(64), InsertDWC(), Floorplan(), - CreateDataflowPartition(), + CreateDataflowPartition(partition_model_dir=self.partition_model_dir), ] for trn in prep_transforms: model = model.transform(trn) @@ -332,7 +347,7 @@ class ZynqBuild(Transformation): kernel_model = kernel_model.transform(HLSSynthIP()) kernel_model = kernel_model.transform( CreateStitchedIP( - self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True + self.fpga_part, self.period_ns, sdp_node.onnx_node.name, False ) ) kernel_model.set_metadata_prop("platform", "zynq-iodma") @@ -345,6 +360,4 @@ class ZynqBuild(Transformation): # set platform attribute for correct remote execution model.set_metadata_prop("platform", "zynq-iodma") - # create driver - model = model.transform(MakePYNQDriver(platform="zynq-iodma")) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index c06c34574aa22a23d1307232b0fd8e65224f1983..39eb049565475b462ea0df9d88b46e3598e6cdd9 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -87,9 +87,14 @@ class RemoveShallowFIFOs(Transformation): def apply(self, model): shallow_fifos = [] for node in model.graph.node: + if len(node.input) > 0: + is_first_node = model.find_producer(node.input[0]) is None + else: + is_first_node = True if ( node.op_type == "StreamingFIFO" and getCustomOp(node).get_nodeattr("depth") <= 
self.shallow_threshold + and (not is_first_node) ): # bypass shallow fifos shallow_fifos.append(node) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 914dda9554395fc89cac8692e13339ae3ce9baf7..64d7a080724820d58a026bafbe74a4d7567b2179 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np import warnings from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance @@ -154,9 +155,16 @@ class SetFolding(Transformation): pe = node_inst.get_nodeattr("PE") swu_node_inst.set_nodeattr("SIMD", pe) else: - raise Exception( - "Expected SWU on DW op input, found " + swu_node.op_type - ) + if op_type == "Vector_Vector_Activate_Batch": + ksize = np.prod(node_inst.get_nodeattr("Kernel")) + elif op_type == "Pool_Batch": + ksize = node_inst.get_nodeattr("KernelSize") + else: + raise Exception("Undefined edge case for %s" % op_type) + if ksize != 1: # pointwise vvau/pool lack a SWU + raise Exception( + "Expected SWU on DW op input, found " + swu_node.op_type + ) elif op_type in simd_ops: if op_type == "ConvolutionInputGenerator": depthwise = node_inst.get_nodeattr("depthwise") diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py index 5265835dd2530a5c93ceefbef629a43d6f33de52..62b9bfb1969114ad858f50cb7eb0137aa12135ed 100644 --- a/src/finn/transformation/fpgadataflow/template_driver.py +++ b/src/finn/transformation/fpgadataflow/template_driver.py @@ -79,7 +79,10 @@ io_shape_dict = { "ishape_packed" : $INPUT_SHAPE_PACKED$, "oshape_packed" : $OUTPUT_SHAPE_PACKED$, "input_dma_name" : $INPUT_DMA_NAME$, - "number_of_external_weights": $EXT_WEIGHT_NUM$ + "output_dma_name" 
: $OUTPUT_DMA_NAME$, + "number_of_external_weights": $EXT_WEIGHT_NUM$, + "num_inputs" : $NUM_INPUTS$, + "num_outputs" : $NUM_OUTPUTS$, } if __name__ == "__main__": @@ -88,8 +91,8 @@ if __name__ == "__main__": parser.add_argument('--platform', help='Target platform: zynq-iodma alveo', default="$PLATFORM$") parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1) parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit") - parser.add_argument('--inputfile', help='name of input npy file (i.e. "input.npy")', default="input.npy") - parser.add_argument('--outputfile', help='name of output npy file (i.e. "output.npy")', default="output.npy") + parser.add_argument('--inputfile', help='name(s) of input npy file(s) (i.e. "input.npy")', nargs="*", type=str, default=["input.npy"]) + parser.add_argument('--outputfile', help='name(s) of output npy file(s) (i.e. "output.npy")', nargs="*", type=str, default=["output.npy"]) parser.add_argument('--runtime_weight_dir', help='path to folder containing runtime-writable .dat weights', default="runtime_weights/") # parse arguments args = parser.parse_args() @@ -111,16 +114,13 @@ if __name__ == "__main__": # for the remote execution the data from the input npy file has to be loaded, # packed and copied to the PYNQ buffer if exec_mode == "execute": - # remove old output file to prevent reusing old output - # in case execution fails - try: - os.remove(outputfile) - except FileNotFoundError: - pass - # load desired input .npy file - ibuf_normal = np.load(inputfile) + # load desired input .npy file(s) + ibuf_normal = [] + for ifn in inputfile: + ibuf_normal.append(np.load(ifn)) obuf_normal = accel.execute(ibuf_normal) - np.save(outputfile, obuf_normal) + for o, obuf in enumerate(obuf_normal): + np.save(outputfile[o], obuf) elif exec_mode == "throughput_test": # remove old metrics file try: diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py 
b/src/finn/transformation/fpgadataflow/vitis_build.py index 502b6f2bffd0d64980ae911d28b845ad90633a44..a2865321418343efbfdae12c111ba4334ecfee28 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -43,7 +43,6 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA -from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.general import ( GiveReadableTensorNames, @@ -207,7 +206,10 @@ class VitisLink(Transformation): # has axis, aximm and axilite # everything else is axis-only # assume only one connection from each ip to the next - producer = model.find_producer(node.input[0]) + if len(node.input) == 0: + producer = None + else: + producer = model.find_producer(node.input[0]) consumer = model.find_consumers(node.output[0]) # define kernel instances # name kernels connected to graph inputs as idmaxx @@ -223,6 +225,7 @@ class VitisLink(Transformation): else: instance_names[node.name] = node.name config.append("nk=%s:1:%s" % (node.name, instance_names[node.name])) + sdp_node.set_nodeattr("instance_name", instance_names[node.name]) # explicitly assign SLRs if the slr attribute is not -1 node_slr = sdp_node.get_nodeattr("slr") if node_slr != -1: @@ -375,6 +378,7 @@ class VitisBuild(Transformation): enable_debug=False, floorplan_file=None, enable_link=True, + partition_model_dir=None, ): super().__init__() self.fpga_part = fpga_part @@ -384,6 +388,7 @@ class VitisBuild(Transformation): self.enable_debug = enable_debug self.floorplan_file = floorplan_file self.enable_link = enable_link + self.partition_model_dir = partition_model_dir def apply(self, model): _check_vitis_envvars() 
@@ -398,7 +403,9 @@ class VitisBuild(Transformation): model = model.transform(Floorplan(floorplan=self.floorplan_file)) - model = model.transform(CreateDataflowPartition()) + model = model.transform( + CreateDataflowPartition(partition_model_dir=self.partition_model_dir) + ) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) @@ -439,6 +446,4 @@ class VitisBuild(Transformation): # set platform attribute for correct remote execution model.set_metadata_prop("platform", "alveo") - # create driver - model = model.transform(MakePYNQDriver(platform="alveo")) return (model, False) diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index 1e2830356fe0133038caaa1dbc43f97ca98378d1..cf712b38054c78c6e414ad914ab67378daec5d12 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -308,8 +308,8 @@ class Absorb1BitMulIntoConv(Transformation): class AbsorbTransposeIntoMultiThreshold(Transformation): - """Change (NCHWTranspose -> MultiThreshold -> NHWCTranspose) to (MultiThreshold) - with NHWC mode. 
For (NCHWTranspose -> MultiThreshold) move Transpose past MT.""" + """For (NCHWTranspose -> MultiThreshold) move Transpose past MultiThreshold + and set its data_layout mode to NHWC.""" def apply(self, model): graph = model.graph @@ -321,43 +321,43 @@ class AbsorbTransposeIntoMultiThreshold(Transformation): perms = list(get_by_name(n.attribute, "perm").ints) if perms == [0, 3, 1, 2]: mt_cand = model.find_consumer(n.output[0]) - if mt_cand.op_type == "MultiThreshold" and not model.is_fork_node( - mt_cand + if ( + mt_cand is not None + and mt_cand.op_type == "MultiThreshold" + # and not model.is_fork_node(mt_cand) ): - final_t_cand = model.find_consumer(mt_cand.output[0]) - if final_t_cand.op_type == "Transpose": - perms = list( - get_by_name(final_t_cand.attribute, "perm").ints - ) - if perms == [0, 2, 3, 1]: - mt = getCustomOp(mt_cand) - mt.set_nodeattr("data_layout", "NHWC") - # get rid of tranpose nodes, wire MT directly - mt_cand.input[0] = n.input[0] - mt_cand.output[0] = final_t_cand.output[0] - graph.node.remove(n) - graph.node.remove(final_t_cand) - graph_modified = True + final_t_cands = model.find_consumers(mt_cand.output[0]) + mt = getCustomOp(mt_cand) + mt.set_nodeattr("data_layout", "NHWC") + # get rid of first tranpose node + mt_cand.input[0] = n.input[0] + graph.node.remove(n) + # fix output shape for MultiThreshold + mt_orig_oshape = model.get_tensor_shape(mt_cand.output[0]) + mt_ishape = model.get_tensor_shape(mt_cand.input[0]) + model.set_tensor_shape(mt_cand.output[0], mt_ishape) + # re-insert Transpose behind MultiThreshold + transpose_output = model.make_new_valueinfo_name() + new_transpose = oh.make_node( + "Transpose", + [mt_cand.output[0]], + [transpose_output], + perm=[0, 3, 1, 2], + ) + graph.node.insert(node_ind + 1, new_transpose) + if final_t_cands is not None: + # rewire next nodes' inputs + for final_t_cand in final_t_cands: + final_t_cand.input[0] = transpose_output else: - mt = getCustomOp(mt_cand) - mt.set_nodeattr("data_layout", 
"NHWC") - # get rid of first tranpose node - mt_cand.input[0] = n.input[0] - graph.node.remove(n) - # fix output shape for MultiThreshold - mt_ishape = model.get_tensor_shape(mt_cand.input[0]) + # replace graph top-level output + get_by_name( + model.graph.output, mt_cand.output[0] + ).name = transpose_output model.set_tensor_shape(mt_cand.output[0], mt_ishape) - # re-insert Transpose behind MultiThreshold - transpose_output = model.make_new_valueinfo_name() - new_transpose = oh.make_node( - "Transpose", - [mt_cand.output[0]], - [transpose_output], - perm=[0, 3, 1, 2], - ) - graph.node.insert(node_ind + 1, new_transpose) - final_t_cand.input[0] = transpose_output - graph_modified = True + # set value_info shape for transpose output + model.set_tensor_shape(transpose_output, mt_orig_oshape) + graph_modified = True if graph_modified: model = model.transform(InferDataTypes()) return (model, graph_modified) @@ -531,11 +531,20 @@ class AbsorbConsecutiveTransposes(Transformation): # TODO implement this to allow for forks as producers consumers = model.find_direct_successors(next_node) prod = model.find_producer(n.input[0]) - for cons in consumers: - for cons_in in cons.input: - if cons_in == next_node.output[0]: - prod.output[0] = cons_in - break + if prod is not None: + for cons in consumers: + for cons_in in cons.input: + if cons_in == next_node.output[0]: + prod.output[0] = cons_in + break + else: + # n.input[0] is top-level graph input + # wire consumers directly to that + for cons in consumers: + for i, iname in enumerate(cons.input): + if iname == next_node.output[0]: + cons.input[i] = n.input[0] + # remove both transposes graph.node.remove(n) graph.node.remove(next_node) @@ -544,3 +553,81 @@ class AbsorbConsecutiveTransposes(Transformation): if graph_modified: model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class AbsorbTransposeIntoResize(Transformation): + """For (NCHWTranspose -> Resize) move Transpose past Resize and + change the 
Resize node's attributes accordingly.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Transpose" and not model.is_fork_node(node): + perms = list(get_by_name(node.attribute, "perm").ints) + if perms == [0, 3, 1, 2]: + mt_cand = model.find_consumer(node.output[0]) + if mt_cand is not None and mt_cand.op_type == "Resize": + mode = get_by_name(mt_cand.attribute, "mode").s.decode("ascii") + # skip if mode is not nearest + if mode != "nearest": + continue + # if sizes specified, turn into scales + if len(mt_cand.input) > 3: + sizes = model.get_initializer(mt_cand.input[3]) + else: + sizes = None + if sizes is not None: + ishape = model.get_tensor_shape(mt_cand.input[0]) + ns, cs, hs, ws = sizes / np.asarray(ishape) + model.set_initializer( + mt_cand.input[2], np.asarray([ns, cs, hs, ws]) + ) + mt_cand.input.remove(mt_cand.input[3]) + # scales already specified, transpose indices to NHWC + scales = model.get_initializer(mt_cand.input[2]) + assert scales is not None + ns, cs, hs, ws = scales + model.set_initializer( + mt_cand.input[2], np.asarray([ns, hs, ws, cs]) + ) + # get rid of first tranpose node + mt_cand.input[0] = node.input[0] + graph.node.remove(node) + is_last_node = mt_cand.output[0] in [ + x.name for x in model.graph.output + ] + + new_tensor_name = model.make_new_valueinfo_name() + if is_last_node: + trans_input = new_tensor_name + trans_output = mt_cand.output[0] + else: + trans_input = mt_cand.output[0] + trans_output = new_tensor_name + # fix tensor shapes for Resize and Transpose + # n, c, h, w = model.get_tensor_shape(mt_cand.input[0]) + n, c, hx, wx = model.get_tensor_shape(mt_cand.output[0]) + model.set_tensor_shape(trans_input, (n, hx, wx, c)) + model.set_tensor_shape(trans_output, (n, c, hx, wx)) + # re-insert Transpose behind Resize + new_transpose = oh.make_node( + "Transpose", + [trans_input], + [trans_output], + perm=[0, 3, 1, 2], + ) + 
graph.node.insert(node_ind + 1, new_transpose) + # rewire nodes + final_t_cands = model.find_consumers(mt_cand.output[0]) + if final_t_cands is not None: + # rewire next nodes' inputs + for final_t_cand in final_t_cands: + final_t_cand.input[0] = trans_output + mt_cand.output[0] = trans_input + graph_modified = True + if graph_modified: + model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 1b22f474abe3f59ac91551efa3661b2612442776..cd44e115eed42d1c0529b86a49e8855ff7c492ce 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -594,11 +594,17 @@ class MoveScalarLinearPastInvariants(Transformation): nodes = [n for n in graph.node] for n in nodes: node_ind += 1 + is_nearest_neighbor_resample = False + if n.op_type == "Upsample" or n.op_type == "Resize": + # Extract mode and scales and input shape + mode = get_by_name(n.attribute, "mode").s.decode("ascii") + is_nearest_neighbor_resample = mode == "nearest" if ( n.op_type == "GlobalAveragePool" or n.op_type == "Reshape" or n.op_type == "Transpose" or n.op_type == "Flatten" + or is_nearest_neighbor_resample ): in0 = n.input[0] if in0 is None: diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 00a9fa721a320a8b70ee913e878955b9caddc3bf..6790485ceab373cd539727aefc59fa8999b3b192 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -63,6 +63,7 @@ from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ +from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from 
finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim @@ -521,14 +522,6 @@ class TestEnd2End: topology, wbits, abits, return_topk=1 ) y = execute_parent(parent_chkpt, rtlsim_chkpt, input_tensor_npy) - model = ModelWrapper(rtlsim_chkpt) - perf["cycles_rtlsim"] = model.get_metadata_prop("cycles_rtlsim") - # warnings.warn("Estimated & rtlsim performance: " + str(perf)) - # for (k, v) in perf.items(): - # update_dashboard_data(topology, wbits, abits, k, v) - update_dashboard_data( - topology, wbits, abits, "cycles_rtlsim", perf["cycles_rtlsim"] - ) assert np.isclose(y, output_tensor_npy).all() @pytest.mark.slow @@ -541,12 +534,17 @@ class TestEnd2End: model = load_test_checkpoint_or_skip(prev_chkpt_name) n_nodes = len(model.graph.node) perf_est = model.analysis(dataflow_performance) - latency = int(model.get_metadata_prop("cycles_rtlsim")) + ret_b1 = throughput_test_rtlsim(model, batchsize=1) + latency = int(ret_b1["cycles"]) cycles_per_sample_est = perf_est["max_cycles"] batchsize = 2 * n_nodes ret = throughput_test_rtlsim(model, batchsize=batchsize) res_cycles = ret["cycles"] est_cycles = latency + cycles_per_sample_est * batchsize + # warnings.warn("Estimated & rtlsim performance: " + str(perf)) + # for (k, v) in perf.items(): + # update_dashboard_data(topology, wbits, abits, k, v) + update_dashboard_data(topology, wbits, abits, "cycles_rtlsim", latency) assert (abs(res_cycles - est_cycles) / res_cycles) < 0.15 @pytest.mark.slow @@ -588,9 +586,22 @@ class TestEnd2End: update_dashboard_data(topology, wbits, abits, "board", cfg["board"]) model.save(get_checkpoint_name(topology, wbits, abits, "build_" + kind)) + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.vitis + @pytest.mark.parametrize("kind", ["zynq", "alveo"]) + def test_make_pynq_driver(self, topology, wbits, abits, kind): + if kind == "alveo" and 
("VITIS_PATH" not in os.environ): + pytest.skip("VITIS_PATH not set") + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + kind) + model = load_test_checkpoint_or_skip(prev_chkpt_name) + kind_to_driver_platform = {"zynq": "zynq-iodma", "alveo": "alveo"} + model = model.transform(MakePYNQDriver(kind_to_driver_platform[kind])) + model.save(get_checkpoint_name(topology, wbits, abits, "driver_" + kind)) + @pytest.mark.parametrize("kind", ["zynq", "alveo"]) def test_deploy(self, topology, wbits, abits, kind): - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + kind) + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "driver_" + kind) model = load_test_checkpoint_or_skip(prev_chkpt_name) cfg = get_build_env(kind, target_clk_ns) if cfg["ip"] == "": diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 1289b02636f030397075a9f580ed0977cd465a88..760c77ea406386ce4886b21bcb042808984dcb7f 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -198,6 +198,7 @@ def test_end2end_mobilenet_lowering(): ) model = model.transform(LowerConvsToMatMul()) model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataTypes()) diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py index 37a1c8d8486a535c8ff87f4b06905b3059bba35a..3357ee6d6c1e540818549f2d0df8b8554690ca3c 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py @@ -70,6 +70,7 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): model = model.transform(LowerConvsToMatMul()) model = model.transform(MakeMaxPoolNHWC()) model = 
model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(Streamline()) model = model.transform(InferDataLayouts()) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index b87241de56870cad70d08583b24292e0da91109e..012bc3e2e140c3fa63729584629613e3046f8838 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -43,6 +43,7 @@ from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim @@ -216,6 +217,7 @@ def test_runtime_thresholds_single_layer(): old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) # need to create stitched IP for runtime weight testing + model = model.transform(InsertFIFO(True)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py new file mode 100644 index 0000000000000000000000000000000000000000..f7e06adb816cfa664187f39a9567d4f742e4043b --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py @@ -0,0 +1,201 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import pytest

import numpy as np
import os
import torch
from brevitas.export import FINNManager
from torch import nn

import finn.core.onnx_exec as oxe
import finn.transformation.streamline.absorb as absorb
from finn.core.datatype import DataType
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.base import Transformation
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.convert_to_hls_layers import InferUpsample
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.general import GiveUniqueNodeNames
from finn.transformation.infer_data_layouts import InferDataLayouts
from finn.transformation.infer_datatypes import InferDataTypes
from finn.transformation.infer_shapes import InferShapes
from finn.transformation.make_input_chanlast import MakeInputChannelsLast

# Directory the exported ONNX file is written to. This is read at import time,
# so FINN_BUILD_DIR must be set before the module is collected (KeyError
# otherwise).
tmpdir = os.environ["FINN_BUILD_DIR"]


class ForceDataTypeForTensors(Transformation):
    """
    Forces a certain datatype for all tensors in a model.

    Every tensor name that appears as an input or an output of any graph node
    is annotated with the datatype passed to the constructor
    (``DataType.INT8`` by default).
    """

    def __init__(self, dType=DataType.INT8):
        super().__init__()
        self._dType = dType

    def apply(self, model):
        """Annotate all node inputs and outputs with the configured datatype.

        Returns the (same) model together with ``False``.
        """
        # NOTE(review): the second tuple element is presumably FINN's
        # "model was changed, run again" flag -- confirm against the
        # Transformation base class.
        for n in model.graph.node:
            # Inputs first, then outputs, for each node.
            for tensor_name in (*n.input, *n.output):
                model.set_tensor_datatype(tensor_name, self._dType)

        return model, False


# Axis permutations between channels-first (NCHW) and channels-last (NHWC).
_to_chan_last_args = (0, 2, 3, 1)
_to_chan_first_args = (0, 3, 1, 2)


class TransposeUpsampleIO(Transformation):
    """
    Converts the input and output shape annotations of all Upsample and
    Resize nodes from NCHW to NHWC.

    Only the shapes recorded for the first input and the first output of each
    matching node are permuted; no Transpose nodes are inserted.
    """

    def apply(self, model):
        """Permute the first input/output shapes of Upsample/Resize nodes.

        Returns the (same) model together with ``False``.
        """
        for n in model.graph.node:
            if n.op_type not in ("Upsample", "Resize"):
                continue
            # Same treatment for the data input and the data output.
            for tensor_name in (n.input[0], n.output[0]):
                NCHW_shape = model.get_tensor_shape(tensor_name)
                NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args]
                model.set_tensor_shape(tensor_name, NHWC_shape)
        return model, False


class PyTorchTestModel(nn.Module):
    """Minimal PyTorch module wrapping a single nearest-neighbour nn.Upsample."""

    def __init__(self, upscale_factor=2):
        super().__init__()
        self.m = nn.Upsample(
            scale_factor=upscale_factor,
            mode="nearest",
        )

    def forward(self, x):
        """Apply the upsampling layer to ``x`` and return the result."""
        return self.m(x)


# param datatype
@pytest.mark.parametrize("dt", [DataType.INT8])
# Width/height of square input feature map
@pytest.mark.parametrize("IFMDim", [3, 5])
# upscaling factor
@pytest.mark.parametrize("scale", [2, 3])
# Number of input/output channels
@pytest.mark.parametrize("NumChannels", [4])
# execution mode
@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
@pytest.mark.vivado
@pytest.mark.slow
def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode):
    """Compare the HLS upsampler (cppsim/rtlsim) against PyTorch and ONNX.

    The test exports a nearest-neighbour ``nn.Upsample`` model through
    ``FINNManager``, checks that ONNX execution matches PyTorch, converts the
    graph with ``InferUpsample`` and finally checks that simulating the
    converted graph reproduces the golden ONNX output.
    """
    atol = 1e-3
    # Create the test model and inputs for it
    torch_model = PyTorchTestModel(upscale_factor=scale)
    input_shape = (1, NumChannels, IFMDim, IFMDim)
    test_in = torch.arange(0, np.prod(np.asarray(input_shape)))
    # Limit the input to values valid for the given datatype
    test_in %= dt.max() - dt.min() + 1
    test_in += dt.min()
    # Additionally make sure we always start with 0, for convenience purposes.
    test_in = torch.roll(test_in, dt.min())
    test_in = test_in.view(*input_shape).type(torch.float32)

    # Get golden PyTorch and ONNX outputs
    golden_torch_float = torch_model(test_in)
    export_path = f"{tmpdir}/Upsample_exported.onnx"
    FINNManager.export(
        torch_model, input_shape=input_shape, export_path=export_path, opset_version=11
    )
    model = ModelWrapper(export_path)
    # The exported model is executed on the float32 input as-is.
    input_dict = {model.graph.input[0].name: test_in.numpy()}
    golden_output_dict = oxe.execute_onnx(model, input_dict, True)
    golden_result = golden_output_dict[model.graph.output[0].name]

    # Make sure PyTorch and ONNX match
    pyTorch_onnx_match = np.isclose(golden_result, golden_torch_float).all()
    assert pyTorch_onnx_match, "ONNX and PyTorch upsampling output don't match."

    # Prep model for execution (reload so the golden run leaves no traces).
    # NOTE: TransposeUpsampleIO is intentionally not applied here.
    model = ModelWrapper(export_path)
    model = model.transform(MakeInputChannelsLast())
    model = model.transform(InferDataLayouts())
    model = model.transform(absorb.AbsorbTransposeIntoResize())
    model = model.transform(InferShapes())
    model = model.transform(ForceDataTypeForTensors(dType=dt))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(InferUpsample())
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())

    # Check that all nodes are UpsampleNearestNeighbour_Batch nodes
    for n in model.get_finn_nodes():
        node_check = n.op_type == "UpsampleNearestNeighbour_Batch"
        assert node_check, "All nodes should be UpsampleNearestNeighbour_Batch nodes."

    # Prep sim
    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 10))
        model = model.transform(HLSSynthIP())
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    # Run sim; the input is permuted from NCHW to NHWC before execution.
    test_in_transposed = test_in.numpy().transpose(_to_chan_last_args)
    input_dict = {model.graph.input[0].name: test_in_transposed}
    output_dict = oxe.execute_onnx(model, input_dict, True)
    test_result = output_dict[model.graph.output[0].name]
    output_matches = np.isclose(golden_result, test_result, atol=atol).all()

    # exec_mode was validated above, so it is either "cppsim" or "rtlsim";
    # capitalize() yields the "Cppsim"/"Rtlsim" message prefixes.
    assert (
        output_matches
    ), f"{exec_mode.capitalize()} output doesn't match ONNX/PyTorch."
diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 73b1315592af79145e1b7c6f147b3ede7e066bce..706d11114f2f08df700efd40afb8dea218efbf42 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -36,8 +36,8 @@ from finn.core.rtlsim_exec import rtlsim_exec from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.general import GiveUniqueNodeNames from finn.util.basic import gen_finn_dt_tensor from finn.util.create import hls_random_mlp_maker @@ -78,11 +78,11 @@ def test_runtime_weights_single_layer(): os.remove("old_weights.dat") old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) + model = model.transform(InsertFIFO(True)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - model = model.transform(PrepareRTLSim()) model.set_metadata_prop("exec_mode", "rtlsim") in_tensor = np.asarray(range(mw), dtype=np.float32) # add two copies of the input tensor as the first one is just used to diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py index f268611c296687987fffe32293b0454109bc7db4..8f4d57d3f84dd5c5a167b0e35b775def8ed27c5d 100644 --- a/tests/fpgadataflow/test_set_folding.py +++ b/tests/fpgadataflow/test_set_folding.py @@ -98,7 +98,8 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): 
model.set_tensor_datatype("outp", adt) for i in range(1, nnodes + 1): - model.graph.value_info.append(tensors[i]) + if tensors[i].name != "outp": + model.graph.value_info.append(tensors[i]) model.set_initializer("weights_" + str(i - 1), W) model.set_initializer("thresh_" + str(i - 1), T) model.set_tensor_datatype("weights_" + str(i - 1), wdt)