diff --git a/.gitignore b/.gitignore index 8b3166a44070a4575aac86c445c4504b594cda08..d7ee7e014a0c175a8a88060f2aa320efeb501ddc 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,6 @@ MANIFEST # SSH key dir mounted into Docker /ssh_keys/ + +# PYNQ board files +/board_files/ diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci index 7d5772d9f5118d1f1238dd14a6b57a1b4fd5004d..0d122133a6446cb77160c9447e16ff13d4d4b9c5 100644 --- a/docker/Dockerfile.finn_ci +++ b/docker/Dockerfile.finn_ci @@ -37,7 +37,7 @@ RUN apt-get update RUN apt-get -y upgrade RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev RUN apt-get install -y verilator zsh -RUN apt-get -y install sshpass +RUN apt-get -y install sshpass wget unzip RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config # cloning dependency repos diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev index 8c1502eb4a1941061bd58e6f9a18106f98f259e2..db49dceb2d06670dfc43059d3a4fa6160a8ded58 100644 --- a/docker/Dockerfile.finn_dev +++ b/docker/Dockerfile.finn_dev @@ -43,19 +43,20 @@ RUN apt-get update RUN apt-get -y upgrade RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev RUN apt-get install -y verilator nano zsh rsync -RUN apt-get -y install sshpass +RUN apt-get -y install sshpass wget unzip RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config COPY requirements.txt . RUN pip install -r requirements.txt RUN rm requirements.txt -RUN pip install jupyter -RUN pip install matplotlib -RUN pip install pytest-dependency -RUN pip install sphinx -RUN pip install sphinx_rtd_theme -RUN pip install pytest-xdist -RUN pip install pytest-parallel +RUN pip install jupyter==1.0.0 +RUN pip install matplotlib==3.3.1 --ignore-installed certifi +RUN pip install pytest-dependency==0.5.1 +RUN pip install sphinx==3.1.2 +RUN pip install sphinx_rtd_theme==0.5.0 +RUN pip install pytest-xdist==2.0.0 +RUN pip install pytest-parallel==0.1.0 +RUN pip install netron==4.4.7 # switch user RUN groupadd -g $GID $GNAME @@ -80,19 +81,6 @@ RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld # oh-my-xilinx RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx -# netron -RUN git clone https://github.com/lutzroeder/netron.git /workspace/netron - -# build and install netron -USER root -RUN curl -sL https://deb.nodesource.com/setup_12.x | bash - -RUN apt-get install -y nodejs -WORKDIR /workspace/netron -RUN git checkout 376e9d33733a3eacfe3c432808fd46e6cd1460cb -RUN npm install -RUN python setup.py build -RUN pip install /workspace/netron -USER $UNAME # for this developer-oriented Docker container we assume the FINN repo is cloned and mounted from the host # at /workspace/finn -- see run-docker.sh for an example of how to do this. 
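With netron now installed from PyPI (pinned to 4.4.7) instead of being built from source with node/npm, viewing a model from inside the dev container only needs the package's Python entry point. A minimal sketch; the port keyword is an assumption chosen to line up with the NETRON_PORT forwarding in run-docker.sh, so check the pinned version's signature:

import netron

# serve an ONNX model on the port that run-docker.sh forwards (default 8081);
# the model path is a placeholder for any FINN-generated .onnx file
netron.start("/workspace/finn/model.onnx", port=8081)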
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 99ad35cd13ef1ca442868f7a7c94154b63c65a5a..7da53140cb2c94ca4abe100499d0b533589b71fc 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -16,10 +16,9 @@ BREVITAS_COMMIT=172e423164402a07826877fa9730063bee10a208 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f -PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d +PYNQSHELL_COMMIT=bf281fc3a44eca29efbcbefd63f1196d82c7c255 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada - gecho "Setting up known-good commit versions for FINN dependencies" # Brevitas gecho "brevitas @ $BREVITAS_COMMIT" @@ -57,4 +56,19 @@ if [ ! -z "$VITIS_PATH" ];then export XILINX_VITIS=$VITIS_PATH source $VITIS_PATH/settings64.sh fi + +# download PYNQ board files if not already there +if [ ! -d "/workspace/finn/board_files" ]; then + gecho "Downloading PYNQ board files for Vivado" + wget -q https://github.com/cathalmccabe/pynq-z1_board_files/raw/master/pynq-z1.zip + wget -q https://d2m32eurp10079.cloudfront.net/Download/pynq-z2.zip + unzip -q pynq-z1.zip + unzip -q pynq-z2.zip + mkdir /workspace/finn/board_files + mv pynq-z1/ board_files/ + mv pynq-z2/ board_files/ + rm pynq-z1.zip + rm pynq-z2.zip +fi + exec "$@" diff --git a/docker/quicktest.sh b/docker/quicktest.sh index 49b7886836ac4e45dad856dfcd49223276bd831a..b06feccdc578a59c8ef00531871e1211c2a407e5 100755 --- a/docker/quicktest.sh +++ b/docker/quicktest.sh @@ -3,20 +3,24 @@ : ${PYTEST_PARALLEL=auto} cd $FINN_ROOT - # check if command line argument is empty or not present if [ -z $1 ]; then echo "Running quicktest: not (vivado or slow) with pytest-xdist" - python setup.py test --addopts "-m 'not (vivado or slow)' --dist=loadfile -n $PYTEST_PARALLEL" + python setup.py test --addopts "-m 'not (vivado or slow or vitis)' --dist=loadfile -n $PYTEST_PARALLEL" elif [ $1 = "main" ]; then echo "Running main test suite: not (rtlsim or end2end) with pytest-xdist" - python setup.py test --addopts "-k not (rtlsim or end2end) --dist=loadfile -n $PYTEST_PARALLEL" + python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL" elif [ $1 = "rtlsim" ]; then echo "Running rtlsim test suite with pytest-parallel" python setup.py test --addopts "-k rtlsim --workers $PYTEST_PARALLEL" elif [ $1 = "end2end" ]; then echo "Running end2end test suite with no parallelism" python setup.py test --addopts "-k end2end" +elif [ $1 = "full" ]; then + echo "Running full test suite, each step with appropriate parallelism" + $0 main; + $0 rtlsim; + $0 end2end; else echo "Unrecognized argument to quicktest.sh" fi diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 323692897800d45c6e6cf55b688a2c7b2b9a5277..8a20dad0e47b9458989039184cfa0e5d01d48aa2 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -92,7 +92,14 @@ These are summarized below: * `JUPYTER_PORT` (default 8888) changes the port for Jupyter inside Docker * `NETRON_PORT` (default 8081) changes the port for Netron inside Docker * `NUM_DEFAULT_WORKERS` (default 1) specifies the degree of parallelization for the transformations that can be run in parallel -* `PYNQ_BOARD` specifies the type of PYNQ board used (Pynq-Z1, Pynq-Z2, Ultra96, ZCU104) for the test suite +* `PYNQ_BOARD` specifies the type of PYNQ board used (see "supported hardware" below) for the test suite * `PYNQ_IP` and `PYNQ_PORT` 
specify ip address and port number to access the PYNQ board * `PYNQ_USERNAME` and `PYNQ_PASSWORD` specify the PYNQ board access credentials for the test suite * `PYNQ_TARGET_DIR` specifies the target dir on the PYNQ board for the test suite + +Supported Hardware +=================== +**End-to-end support including driver:** For quick deployment, FINN targets boards supported by `PYNQ <https://pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards. + +**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials) and wire it up to your FPGA design. + diff --git a/finn-rtllib/memstream/component.xml b/finn-rtllib/memstream/component.xml index 6b728c0555a4889b8e76d5759233d1109a3002bd..7910a8284dad3674b8665136506a60c498e0547f 100644 --- a/finn-rtllib/memstream/component.xml +++ b/finn-rtllib/memstream/component.xml @@ -1051,6 +1051,7 @@ <xilinx:family xilinx:lifeCycle="Beta">azynq</xilinx:family> <xilinx:family xilinx:lifeCycle="Beta">zynquplus</xilinx:family> <xilinx:family xilinx:lifeCycle="Production">virtexuplus</xilinx:family> + <xilinx:family xilinx:lifeCycle="Production">virtexuplusHBM</xilinx:family> </xilinx:supportedFamilies> <xilinx:taxonomies> <xilinx:taxonomy>/UserIP</xilinx:taxonomy> diff --git a/notebooks/end2end_example/cnv_end2end_example.ipynb b/notebooks/end2end_example/cnv_end2end_example.ipynb index ce8c9decf4aaa6b7be2e556b6053abf380d0d373..74efa67d16616f64b21d84a8ef328ceaf2f3ce09 100644 --- a/notebooks/end2end_example/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/cnv_end2end_example.ipynb @@ -574,7 +574,7 @@ "target_dir = os.getenv(\"PYNQ_TARGET_DIR\", \"/home/xilinx/finn\")\n", "\n", "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_synth.onnx\")\n", - "model = model.transform(MakePYNQDriver())\n", + "model = model.transform(MakePYNQDriver(platform=\"zynq\"))\n", "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n", "model.save(build_dir + \"/end2end_cnv_w1a1_pynq_deploy.onnx\")" ] diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/tfc_end2end_example.ipynb index c84efc964b1f57b7ed385521fc5214fdc2396590..c388feca2340792c3535dba3fb3cf5e7220adf3c 100644 --- a/notebooks/end2end_example/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/tfc_end2end_example.ipynb @@ -730,7 +730,7 @@ " 'ip_path': ('s', False, ''),\n", " 'ip_vlnv': ('s', False, ''),\n", " 'exec_mode': ('s', False, ''),\n", - " 'sim_cycles': ('i', False, 0),\n", + " 'cycles_rtlsim': ('i', False, 0),\n", " 'rtlsim_trace': ('s', False, ''),\n", " 'res_estimate': ('s', False, ''),\n", " 'res_hls': ('s', False, ''),\n", @@ -1422,7 +1422,7 @@ "source": [ "from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver\n", "model = ModelWrapper(build_dir + \"/tfc_w1_a1_post_synthesis.onnx\")\n", - "model = model.transform(MakePYNQDriver())" + "model = model.transform(MakePYNQDriver(platform=\"zynq\"))" ] }, { diff --git a/requirements.txt b/requirements.txt index b15d86ed89f7b0e76b772ce42aba6481937310b0..4aa1cbe3484a3447851879d7da9ce9d48b066592
100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,11 @@ -bitstring -docrep -future +bitstring==3.1.7 +docrep==0.2.7 +future==0.18.2 numpy==1.18.0 onnx==1.6.0 onnxruntime==1.2.0 -pre-commit -pyverilator -scipy -sphinx -toposort -vcdvcd -wget +pre-commit==2.6.0 +scipy==1.5.2 +toposort==1.5 +vcdvcd==1.0.5 +wget==3.2 diff --git a/setup.cfg b/setup.cfg index 1d7dcf247636b486e35d6320669eae706c2b7a72..7729d0949ee133e06242905afab31708e79ebf04 100644 --- a/setup.cfg +++ b/setup.cfg @@ -104,6 +104,7 @@ addopts = markers = slow: marks tests as slow (deselect with '-m "not slow"') vivado: mark tests that require Vivado or Vivado HLS + vitis: mark tests that require Vitis norecursedirs = dist build diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..201333aebdb3fc1d15464389e37326dcaf6848e0 --- /dev/null +++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import finn.custom_op.registry as registry +from finn.util.fpgadataflow import is_fpgadataflow_node + + +def exp_cycles_per_layer(model): + """Estimates the number of cycles per sample for dataflow layers in the given model. + Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames + transformation) prior to calling this analysis pass to ensure all nodes are + visible in the results. 
+ + Returns {node name : cycle estimation}.""" + + cycle_dict = {} + for node in model.graph.node: + if is_fpgadataflow_node(node) is True: + op_type = node.op_type + inst = registry.custom_op[op_type](node) + cycle_dict[node.name] = inst.get_exp_cycles() + + return cycle_dict diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py index ad30282d93034f8d043a05a2172790349c31ec83..03b31b9c1ec51b45e17152d35d5824b6137ab4a2 100644 --- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py +++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py @@ -35,6 +35,9 @@ from finn.util.fpgadataflow import is_fpgadataflow_node def hls_synth_res_estimation(model): """Extracts the FPGA resource results from the Vivado HLS synthesis estimates. + Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames + transformation) prior to calling this analysis pass to ensure all nodes are + visible in the results. Returns {node name : resources_dict}.""" diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py index 508c34aaed50f2935f4915cdcea29a3e92641b3c..9206f3f6fcd81de175babef54de990fe01c861e1 100644 --- a/src/finn/analysis/fpgadataflow/post_synth_res.py +++ b/src/finn/analysis/fpgadataflow/post_synth_res.py @@ -30,15 +30,23 @@ import os import xml.etree.ElementTree as ET from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.core.modelwrapper import ModelWrapper +from finn.custom_op.registry import getCustomOp -def post_synth_res(model): +def post_synth_res(model, override_synth_report_filename=None): """Extracts the FPGA resource results from the Vivado synthesis. + Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames + transformation) prior to calling this analysis pass to ensure all nodes are + visible in the results. Returns {node name : resources_dict}.""" res_dict = {} - synth_report_filename = model.get_metadata_prop("vivado_synth_rpt") + if override_synth_report_filename is not None: + synth_report_filename = override_synth_report_filename + else: + synth_report_filename = model.get_metadata_prop("vivado_synth_rpt") if os.path.isfile(synth_report_filename): tree = ET.parse(synth_report_filename) root = tree.getroot() @@ -50,7 +58,11 @@ def post_synth_res(model): raise Exception("Please run synthesis first") for node in model.graph.node: - if _is_fpgadataflow_node(node): + if node.op_type == "StreamingDataflowPartition": + sdp_model = ModelWrapper(getCustomOp(node).get_nodeattr("model")) + sdp_res_dict = post_synth_res(sdp_model, synth_report_filename) + res_dict.update(sdp_res_dict) + elif _is_fpgadataflow_node(node): row = root.findall(".//*[@contents='%s']/.." % node.name) if row != []: node_dict = {} diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index c190059eceb0cc111477c84f843f4a9f9bf2f393..e52557573dab072709da4452f4e2d477e99b98c9 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -32,6 +32,9 @@ from finn.util.fpgadataflow import is_fpgadataflow_node def res_estimation(model): """Estimates the resources needed for the given model. + Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames + transformation) prior to calling this analysis pass to ensure all nodes are + visible in the results. 
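The analysis passes above all follow the same convention: a plain function that takes a ModelWrapper and returns a per-node dict, invoked through ModelWrapper.analysis(). A minimal sketch of the estimation flow, with a placeholder model path; GiveUniqueNodeNames is the transformation the docstrings refer to:

from finn.core.modelwrapper import ModelWrapper
from finn.transformation.general import GiveUniqueNodeNames
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.analysis.fpgadataflow.res_estimation import res_estimation

model = ModelWrapper("dataflow_model.onnx")  # placeholder path
# give nodes unique names first, so all of them show up in the result dicts
model = model.transform(GiveUniqueNodeNames())
cycles = model.analysis(exp_cycles_per_layer)  # {node name: cycle estimate}
resources = model.analysis(res_estimation)     # {node name: resource dict}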
Returns {node name : resource estimation}.""" diff --git a/src/finn/core/datatype.py b/src/finn/core/datatype.py index 222d11a8872f9be757fd60fbfa5f8abea683311a..df895a1ad446d6b2cc3ebb24f1179944f4cfe9ab 100644 --- a/src/finn/core/datatype.py +++ b/src/finn/core/datatype.py @@ -50,17 +50,69 @@ class DataType(Enum): UINT2 = auto() UINT3 = auto() UINT4 = auto() + UINT5 = auto() + UINT6 = auto() + UINT7 = auto() UINT8 = auto() + UINT9 = auto() + UINT10 = auto() + UINT11 = auto() + UINT12 = auto() + UINT13 = auto() + UINT14 = auto() + UINT15 = auto() UINT16 = auto() + UINT17 = auto() + UINT18 = auto() + UINT19 = auto() + UINT20 = auto() + UINT21 = auto() + UINT22 = auto() + UINT23 = auto() + UINT24 = auto() + UINT25 = auto() + UINT26 = auto() + UINT27 = auto() + UINT28 = auto() + UINT29 = auto() + UINT30 = auto() + UINT31 = auto() UINT32 = auto() + UINT64 = auto() BIPOLAR = auto() TERNARY = auto() INT2 = auto() INT3 = auto() INT4 = auto() + INT5 = auto() + INT6 = auto() + INT7 = auto() INT8 = auto() + INT9 = auto() + INT10 = auto() + INT11 = auto() + INT12 = auto() + INT13 = auto() + INT14 = auto() + INT15 = auto() INT16 = auto() + INT17 = auto() + INT18 = auto() + INT19 = auto() + INT20 = auto() + INT21 = auto() + INT22 = auto() + INT23 = auto() + INT24 = auto() + INT25 = auto() + INT26 = auto() + INT27 = auto() + INT28 = auto() + INT29 = auto() + INT30 = auto() + INT31 = auto() INT32 = auto() + INT64 = auto() FLOAT32 = auto() def bitwidth(self): diff --git a/src/finn/core/modelwrapper.py b/src/finn/core/modelwrapper.py index 646add188c5d475cf37ccd33cf24d29d61754ae1..98b234592ebe0c704fafd1eed980325d8566e7e2 100644 --- a/src/finn/core/modelwrapper.py +++ b/src/finn/core/modelwrapper.py @@ -36,6 +36,11 @@ from onnx import TensorProto import finn.util.basic as util import finn.util.onnx as onnxutil from finn.core.datatype import DataType +from finn.transformation.general import ( + RemoveUnusedTensors, + RemoveStaticGraphInputs, + SortGraph, +) class ModelWrapper: @@ -87,7 +92,7 @@ class ModelWrapper: """Runs given anaylsis_fxn on this model and return resulting dict.""" return analysis_fxn(self) - def transform(self, transformation, make_deepcopy=True): + def transform(self, transformation, make_deepcopy=True, cleanup=True): """Applies given Transformation repeatedly until no more changes can be made and returns a transformed ModelWrapper instance. @@ -101,6 +106,22 @@ class ModelWrapper: (transformed_model, model_was_changed) = transformation.apply( transformed_model ) + if cleanup: + transformed_model.cleanup() + return transformed_model + + def cleanup(self): + "Run cleanup transformations on the model." 
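Because transform() now applies the cleanup transformations after every pass by default, callers that rely on unused tensors or static graph inputs surviving between passes have to opt out explicitly and invoke the cleanup() defined just below once at the end. A sketch, with SomePass and OtherPass as hypothetical stand-ins:

# hypothetical passes; cleanup=False keeps intermediate tensors/inputs intact
model = model.transform(SomePass(), cleanup=False)
model = model.transform(OtherPass(), cleanup=False)
model = model.cleanup()  # run the cleanup transformations once at the end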
+ transformed_model = self + cleanup_transforms = [ + RemoveUnusedTensors(), + RemoveStaticGraphInputs(), + SortGraph(), + ] + for trn in cleanup_transforms: + transformed_model = transformed_model.transform( + trn, cleanup=False, make_deepcopy=False + ) return transformed_model def check_compatibility(self): diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py index 7c3123cd5eb29a54dc5cbfb912225ad3fdb0f219..0c01a48a07608dcd760447e8f569128f58d86f28 100644 --- a/src/finn/core/onnx_exec.py +++ b/src/finn/core/onnx_exec.py @@ -51,8 +51,20 @@ def execute_node(node, context, graph): if node.op_type == "StreamingDataflowPartition": sdp_node = getCustomOp(node) model = ModelWrapper(sdp_node.get_nodeattr("model")) - ret = execute_onnx(model, context, True) - context.update(ret) + inp_ctx = dict(filter(lambda x: x[0] in node.input, context.items())) + # input may have been renamed in partition + assert len(inp_ctx) == 1 + old_iname = node.input[0] + new_iname = model.graph.input[0].name + if old_iname != new_iname: + inp_ctx[new_iname] = inp_ctx[old_iname] + del inp_ctx[old_iname] + ret = execute_onnx(model, inp_ctx, False) + # output may have been renamed in partition + assert len(ret) == 1 + node_oname = node.output[0] + model_oname = model.graph.output[0].name + context[node_oname] = ret[model_oname] else: if node.domain == "finn": diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py index a533e4d36629f57f7c4a576570d75a1e051de5be..214358608c43a868f9ef414dcbf6eb33e3f45a5b 100644 --- a/src/finn/core/remote_exec.py +++ b/src/finn/core/remote_exec.py @@ -62,11 +62,15 @@ def remote_exec(model, execution_context): bash_command = ["/bin/bash", "-c", cmd] process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) process_compile.communicate() + # set platform attribute for correct remote execution + platform = model.get_metadata_prop("platform") + assert platform in ["alveo", "zynq", "zynq-iodma"] cmd = ( "sshpass -p {} ssh {}@{} -p {} " '"cd {}/{}; echo "{}" | ' 'sudo -S python3.6 driver.py --exec_mode="execute" --batchsize=1" ' - '--bitfile="resizer.bit" --inputfile="input.npy" --outputfile="output.npy"' + '--bitfile="resizer.bit" --inputfile="input.npy" --outputfile="output.npy" ' + '--platform="{}" ' ).format( pynq_password, pynq_username, @@ -75,6 +79,7 @@ def remote_exec(model, execution_context): pynq_target_dir, deployment_folder, pynq_password, + platform, ) bash_command = ["/bin/bash", "-c", cmd] process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py index bb5b3075582b8e01e8eed95f709934302fcadb42..d83bcd3a75dd0d2fc02315c72784e57348901a04 100644 --- a/src/finn/core/rtlsim_exec.py +++ b/src/finn/core/rtlsim_exec.py @@ -102,7 +102,7 @@ def rtlsim_exec(model, execution_context): sim = PyVerilator(rtlsim_so, auto_eval=False) ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file) packed_output = ret[0] - model.set_metadata_prop("sim_cycles", str(ret[1])) + model.set_metadata_prop("cycles_rtlsim", str(ret[1])) # unpack output and put into context o_folded_tensor = rtlsim_output_to_npy( packed_output, None, o_dt, o_folded_shape, packedBits, targetBits @@ -171,7 +171,7 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True): no_change_count = no_change_count + 1 if len(outputs) == num_out_values: - sim_cycles = observation_count + cycles_rtlsim = observation_count output_observed = True if no_change_count == liveness_threshold: @@ 
-191,4 +191,4 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True): sim.flush_vcd_trace() sim.stop_vcd_trace() - return (outputs, sim_cycles) + return (outputs, cycles_rtlsim) diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py index 4444e7584f843cd0edb016b520d01d71e659b904..fbfe775e581e063b08e34b3096fd34f412b47d11 100644 --- a/src/finn/core/throughput_test.py +++ b/src/finn/core/throughput_test.py @@ -125,7 +125,7 @@ def throughput_test_rtlsim(model, batchsize=100): os.environ["LIVENESS_THRESHOLD"] = "-1" rtlsim_exec(model, ctx) # extract metrics - cycles = int(model.get_metadata_prop("sim_cycles")) + cycles = int(model.get_metadata_prop("cycles_rtlsim")) clk_ns = float(model.get_metadata_prop("clk_ns")) fclk_mhz = 1 / (clk_ns * 0.001) runtime_s = (cycles * clk_ns) * (10 ** -9) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index bc816f18c5f72338dc726e504182998f3f4430b7..65c898a8c453420ed96ca22715ef2595c5840288 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -82,7 +82,8 @@ class HLSCustomOp(CustomOp): "ip_path": ("s", False, ""), "ip_vlnv": ("s", False, ""), "exec_mode": ("s", False, ""), - "sim_cycles": ("i", False, 0), + "cycles_rtlsim": ("i", False, 0), + "cycles_estimate": ("i", False, 0), "rtlsim_trace": ("s", False, ""), "res_estimate": ("s", False, ""), "res_hls": ("s", False, ""), @@ -209,6 +210,12 @@ class HLSCustomOp(CustomOp): HLSCustomOp class but has to be filled by every node""" return 0 + def get_exp_cycles(self): + """Function for estimation of expected cycles for set folding, + is member function of HLSCustomOp class but has to be filled + by every node""" + return 0 + def code_generation_ipgen(self, model, fpgapart, clk): """Generates c++ code and tcl script for ip generation.""" node = self.onnx_node @@ -436,7 +443,7 @@ compilation transformations? no_change_count = no_change_count + 1 if len(outputs) == num_out_values: - self.set_nodeattr("sim_cycles", observation_count) + self.set_nodeattr("cycles_rtlsim", observation_count) output_observed = True if no_change_count == liveness_threshold: @@ -465,7 +472,7 @@ compilation transformations? 
trace_file = self.onnx_node.name + ".vcd" num_out_values = self.get_number_output_values() total_cycle_count = rtlsim_multi_io(sim, io_dict, num_out_values, trace_file) - self.set_nodeattr("sim_cycles", total_cycle_count) + self.set_nodeattr("cycles_rtlsim", total_cycle_count) def execute_node(self, context, graph): """Executes single node using cppsim or rtlsim.""" diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py index d73f22672e7163eef0738d067f951e90fe80a89f..14fb65739dab4208edd0c61bb7ca8ae2d114baab 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py @@ -170,6 +170,10 @@ class AddStreams_Batch(HLSCustomOp): def get_number_output_values(self): return np.prod(self.get_folded_output_shape()[:-1]) + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py index ad68a4bde29123b2498ac7789048bcd2e13bf3bc..d8e74a4d13043a741cf787477c51b63925b7aad8 100644 --- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py +++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py @@ -224,6 +224,10 @@ class ChannelwiseOp_Batch(HLSCustomOp): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 3e40ad70208909551365c51324153859ccc79ceb..d33d6c963c0c55309f7f258c9ec1d7723e112282 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -177,6 +177,23 @@ class ConvolutionInputGenerator(HLSCustomOp): num_output_elems = np.prod(folded_oshape[:-1]) return num_output_elems + def get_exp_cycles(self): + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h + cycles_write_block = (ofm_dim * k * k * (ifm_ch / simd)) / mmv + cycles_read_block = stride * ifm_dim * (ifm_ch / simd) + max_cycles = max(cycles_write_block, cycles_read_block) + exp_cycles = ifm_dim * k * (ifm_ch / simd) + ofm_dim * max_cycles + + return int(exp_cycles) + def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py index 0ce4379a2c41baa5bc009e9df7623d133ee89a09..15d55653b4e431dead885d75650b1500150d8775 100644 --- a/src/finn/custom_op/fpgadataflow/downsampler.py +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -36,6 +36,14 @@ class DownSampler(HLSCustomOp): stride = self.get_nodeattr("Stride") return int(np.floor((idim - 1) / stride) + 1) + def 
get_exp_cycles(self): + idim = self.get_nodeattr("ImgDim") + channels = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + batch_size = self.get_nodeattr("numInputVectors") + exp_cycles = channels / simd * batch_size * idim * idim + return int(exp_cycles) + def get_normal_input_shape(self): idim = self.get_nodeattr("ImgDim") num_ch = self.get_nodeattr("NumChannels") diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py index e4762509fb6246bafa7441e194312d69ad585d1b..044cfddaab51a5f9bf7aa25e9123247b10de8529 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py @@ -164,6 +164,10 @@ class DuplicateStreams_Batch(HLSCustomOp): def get_number_output_values(self): return 2 * np.prod(self.get_folded_output_shape()[1:-1]) + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py index d326ae7dfc7830a0081c3b13233d67ef08b12eff..f9a9dc4340b18578550a9c453d90de86234d1cad 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -42,6 +42,14 @@ class FMPadding_Batch(HLSCustomOp): pad = self.get_nodeattr("Padding") return idim + pad + def get_exp_cycles(self): + odim = self.get_padded_odim() + channels = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + batch_size = self.get_nodeattr("numInputVectors") + exp_cycles = (channels / simd) * batch_size * odim * odim + return exp_cycles + def get_normal_input_shape(self): idim = self.get_nodeattr("ImgDim") num_ch = self.get_nodeattr("NumChannels") diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py index 83152dea6cc494b8464c78605399b21b38d48b80..1a75858880a072345ef942ca91feabf0bec9ab36 100644 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py @@ -182,6 +182,13 @@ class GlobalAccPool_Batch(HLSCustomOp): def get_number_output_values(self): return np.prod(self.get_folded_output_shape()[1:-1]) + def get_exp_cycles(self): + # Channels/PE * batch size * idim * idim + Channels/PE + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + folds = int(ch / pe) + return np.prod(self.get_folded_input_shape()[:-1]) + folds + def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py index 05870b8d9d5d3a11bad7882c9a7d122f8cd34cf6..7d0374445d816f1e8d49ed92cf7aa67b024f9ac1 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -197,11 +197,13 @@ class IODMA(HLSCustomOp): def get_number_output_values(self): oshape = self.get_normal_output_shape() itype_bits = self.get_input_datatype().bitwidth() - intfw = self.get_nodeattr("intfWidth") + stream_width = self.get_nodeattr("streamWidth") nelems = np.prod(oshape) nbits = nelems * itype_bits - assert nbits % intfw == 0, "DMA: total transfer size must be word multiple" - ovalues = nbits // intfw + assert ( + nbits % stream_width == 0 + ), "DMA: total 
transfer size must be word multiple" + ovalues = nbits // stream_width return ovalues def global_includes(self): diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index 801a634fdba1cd5e16c7c211175c1e7380bf0070..4a2fa6889ae0ebb94976d50b0fc8362d01a63bea 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -136,6 +136,16 @@ class Pool_Batch(HLSCustomOp): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[1:-1]) + def get_exp_cycles(self): + # (Channels * kernel * kernel) / PE * odim * odim * batch_size + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + k = self.get_nodeattr("KernelSize") + odim = self.get_nodeattr("OutImgDim") + batch_size = self.get_nodeattr("BatchSize") + exp_cycles = ((ifm_ch * k * k) / pe) * odim * odim * batch_size + return int(exp_cycles) + def get_instream_width(self): dt_bits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 9c3bd3ac87b94f3e0ff11a2937bf5083aae614f6..181e04f7142053708cc5b2338a8078f6c9fc8303 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -39,6 +39,7 @@ from finn.custom_op.fpgadataflow import HLSCustomOp from finn.util.basic import ( interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, + calculate_matvec_accumulator_range, ) from finn.util.data_packing import ( npy_to_rtlsim_input, @@ -75,6 +76,8 @@ class StreamingFCLayer_Batch(HLSCustomOp): "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), "outputDataType": ("s", True, ""), + # FINN DataType for accumulator -- auto-computed and updated + "accDataType": ("s", False, "INT32"), # use xnor-popcount for binary weights/inputs, thus treating them # as bipolar "binaryXnorMode": ("i", False, 0), @@ -278,6 +281,17 @@ class StreamingFCLayer_Batch(HLSCustomOp): return c0 + c1 * (P * Q) * (W * A) + def get_exp_cycles(self): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + num_inp_vec = self.get_nodeattr("numInputVectors") + mh = self.get_nodeattr("MH") + mw = self.get_nodeattr("MW") + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv + return int(exp_cycles) + def get_input_datatype(self): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] @@ -433,6 +447,51 @@ class StreamingFCLayer_Batch(HLSCustomOp): ret = np.flip(ret, axis=-1) return ret + def minimize_accumulator_width(self, model): + weights = model.get_initializer(self.onnx_node.input[1]) + if len(self.onnx_node.input) > 2: + thresholds = model.get_initializer(self.onnx_node.input[2]) + else: + thresholds = None + idt = self.get_input_datatype() + # calculate minimum and maximum values of accumulator + (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) + if thresholds is not None: + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + # set threshold datatype (and accumulator datatype implicitly) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + # get range required by threshold values + tdt_min = min(acc_min, min_threshold) + tdt_max = max(acc_max, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: 
+ tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(0 - tdt_max) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + self.set_nodeattr("accDataType", tdt.name) + else: + if acc_min < 0: + if abs(acc_min) > acc_max: + adt = DataType.get_smallest_possible(acc_min) + else: + adt = DataType.get_smallest_possible(0 - acc_max) + else: + adt = DataType.get_smallest_possible(acc_max) + # ensure a datatype divisible by 8-bits in case this is the last node + bw = roundup_to_integer_multiple(adt.bitwidth(), 8) + new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) + adt = DataType[new_adt_name] + self.set_nodeattr("accDataType", adt.name) + # for no-activation nodes, output dt = acc dt + self.set_nodeattr("outputDataType", adt.name) + return DataType[self.get_nodeattr("accDataType")] + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: @@ -594,7 +653,6 @@ class StreamingFCLayer_Batch(HLSCustomOp): thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - tdt = DataType.INT32 # use UINT32 threshold export for bipolar times bipolar inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR @@ -604,8 +662,12 @@ class StreamingFCLayer_Batch(HLSCustomOp): bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - if inp_is_bipolar and wt_is_bipolar: - tdt = DataType.UINT32 + # get computed threshold datatype from attribute + tdt = DataType[self.get_nodeattr("accDataType")] + + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) thresholds_hls_code = numpy_to_hls_code( threshold_tensor, tdt, "thresholds", False, True ) diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py index 2344e12f7e87634c189563f9cde7b1c861a3606e..4c772358648f402467cee628afe410d7bce83ede 100644 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py @@ -95,6 +95,12 @@ class StreamingMaxPool_Batch(HLSCustomOp): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) + def get_exp_cycles(self): + # derived from StreamingMaxPool_Batch loop nest + k = self.get_nodeattr("PoolDim") + ifm_dim = self.get_nodeattr("ImgDim") + return ifm_dim * (ifm_dim + (ifm_dim / k)) + def get_instream_width(self): dt_bits = self.get_input_datatype().bitwidth() ifm_ch = self.get_nodeattr("NumChannels") diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 1da60a5124fa86b4336bae8fd1a587672f2f2e6f..319731df70d5bd1cb80d42932f08acdcec80c074 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -344,6 +344,7 @@ set_property supported_families { \ virtex7 Production \ virtexu Production \ virtexuplus Production \ + virtexuplusHBM Production \ zynq Production \ zynquplus Production \ aartix7 Production \ diff 
--git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index fa33c70218fab16f106da45e296f0d59ae4ea606..562bab0f18990096f7364b3a4e2bcbbbf4ce2b58 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -215,6 +215,10 @@ class Thresholding_Batch(HLSCustomOp): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" @@ -279,7 +283,25 @@ class Thresholding_Batch(HLSCustomOp): thresholds = model.get_initializer(self.onnx_node.input[1]) threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - tdt = DataType.INT32 + + min_threshold = thresholds.min() + max_threshold = thresholds.max() + min_input = self.get_input_datatype().min() + max_input = self.get_input_datatype().max() + # get range required by threshold values + tdt_min = min(min_input, min_threshold) + tdt_max = max(max_input, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: + tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(0 - tdt_max - 1) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + thresholds_hls_code = numpy_to_hls_code( threshold_tensor, tdt, "thresholds", False, True ) diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..942e4b25700d0c52c1bc5bcd81614a058342f178 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py @@ -0,0 +1,506 @@ +import os +import numpy as np + +from onnx import TensorProto, helper +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp +from finn.util.basic import interleave_matrix_outer_dim_from_partitions +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + rtlsim_output_to_npy, +) + + +class Vector_Vector_Activate_Batch(HLSCustomOp): + """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "PE": ("i", True, 0), + "Dim": ("i", True, 0), + "Channels": ("i", True, 0), + "Kernel": ("i", True, 0), + "resType": ("s", True, ""), + "ActVal": ("i", False, 0), + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # no-activation mode (produce accumulators) + "noActivation": ("i", False, 0), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def calc_wmem(self): + """Calculates and returns WMEM.""" + ch = self.get_nodeattr("Channels") + k = self.get_nodeattr("Kernel") + pe = self.get_nodeattr("PE") + wmem = k * k * ch // pe + return wmem + + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + return ch // pe + + def make_shape_compatible_op(self, model): + oshape = 
self.get_normal_output_shape() + # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # check input datatype against property + idt_name = self.get_input_datatype().name + exp_idt_name = self.get_nodeattr("inputDataType") + assert exp_idt_name == idt_name, "Bad input DataType for VVAU node" + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_weight_datatype(self): + """Returns FINN DataType of weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self): + i_bits = self.get_input_datatype().bitwidth() + in_width = i_bits * self.get_nodeattr("Channels") + return in_width + + def get_outstream_width(self): + o_bits = self.get_output_datatype().bitwidth() + out_width = o_bits * self.get_nodeattr("PE") + return out_width + + def get_folded_input_shape(self): + k = self.get_nodeattr("Kernel") + sf = k * k + dim = self.get_nodeattr("Dim") + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + nf = ch // pe + folded_input_shape = tuple([1, dim, dim, sf * nf, pe]) + return folded_input_shape + + def get_folded_output_shape(self): + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + nf = ch // pe + dim = self.get_nodeattr("Dim") + folded_output_shape = tuple([1, dim, dim, nf, pe]) + return folded_output_shape + + def get_normal_input_shape(self): + dim = self.get_nodeattr("Dim") + ch = self.get_nodeattr("Channels") + k = self.get_nodeattr("Kernel") + normal_input_shape = tuple([1, dim, dim, k * k * ch]) + return normal_input_shape + + def get_normal_output_shape(self): + ch = self.get_nodeattr("Channels") + dim = self.get_nodeattr("Dim") + normal_output_shape = tuple([1, dim, dim, ch]) + return normal_output_shape + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_exp_cycles(self): + pe = self.get_nodeattr("PE") + ch = self.get_nodeattr("Channels") + dim = self.get_nodeattr("Dim") + k = self.get_nodeattr("Kernel") + # currently FINN supports for vvau a batch size of 1 + batch_size = 1 + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = ((ch * k * k) / pe) * batch_size * (dim * dim) / mmv + return int(exp_cycles) + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR + wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR + # fill in TSrcI and TWeightI + # TODO handle bipolar inputs + if inp_is_bipolar or wt_is_bipolar: + raise Exception("VVAU node doesn't support bipolar values yet.") + else: + ret["TSrcI"] = 
"Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_hls_compatible_weight_tensor(self, orig_weight_matrix): + pe = self.get_nodeattr("PE") + ch = self.get_nodeattr("Channels") + k = self.get_nodeattr("Kernel") + wmem = self.calc_wmem() + assert orig_weight_matrix.shape == ( + ch, + 1, + k, + k, + ), """Weights matrix doesn't + have expected shape (channels, 1, kernel_size, kernel_size)""" + ret = orig_weight_matrix + ret = ret.reshape(ch, k * k) + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + ret = ret.reshape(1, pe, wmem, 1) + return ret + + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + tmem = self.calc_tmem() + assert ch % pe == 0, "Requirement Channels divisable by PE is violated." + assert ( + orig_thres_matrix.ndim == 2 + ), """Threshold matrix dimension is + not as expected (2).""" + n_thres_steps = orig_thres_matrix.shape[1] + ret = orig_thres_matrix + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + assert ( + ret.shape[2] == n_thres_steps + ), """Third dimension after distribution of the + rows between PEs is not as expected (n_thres_steps)""" + return ret.reshape(1, pe, tmem, n_thres_steps) + + def generate_params(self, model, path): + # weights + weights = model.get_initializer(self.onnx_node.input[1]) + # convert weights into hlslib-compatible format + weight_tensor = self.get_hls_compatible_weight_tensor(weights) + wdt = self.get_weight_datatype() + code_gen_dir = path + + """Saves weights into params.h""" + weight_hls_code = numpy_to_hls_code(weight_tensor, wdt, "weights", True, True) + # write weights into params.h + f_weights = open("{}/params.h".format(code_gen_dir), "w") + + if wdt.bitwidth() != 1: + f_weights.write( + "const FixedPointWeights<1,{},{},{}> weights = ".format( + wdt.get_hls_datatype_str(), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) + ) + else: + f_weights.write( + "const BinaryWeights<1,{},{}> weights = ".format( + self.get_nodeattr("PE"), self.calc_wmem() + ) + ) + f_weights.write(weight_hls_code) + f_weights.close() + + # save thresholds in thresh.h + if len(self.onnx_node.input) > 2: + thresholds = model.get_initializer(self.onnx_node.input[2]) + if thresholds is not None: + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + tdt = DataType.INT32 + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds are not int" + thresholds_hls_code = numpy_to_hls_code( + threshold_tensor, tdt, "thresholds", False, True + ) + # write thresholds into thresh.h + f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") + tdt_hls = tdt.get_hls_datatype_str() + odt = self.get_output_datatype() + odt_hls = odt.get_hls_datatype_str() + f_thresh.write( + "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ + = ".format( + self.calc_tmem(), + self.get_nodeattr("PE"), + threshold_tensor.shape[-1], + tdt_hls, + odt_hls, + self.get_nodeattr("ActVal"), + "std::less_equal<%s>" % tdt_hls, + ) + ) + f_thresh.write(thresholds_hls_code) + f_thresh.close() + + def execute_node(self, context, graph): + 
mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following values ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create an npy file for each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception( + "Unexpected input found for Vector_Vector_Activate_Unit" + ) + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == self.get_folded_output_shape() + ), """Output shape is not as expected""" + # reshape output to have expected shape + oshape = self.get_normal_output_shape() + context[node.output[0]] = context[node.output[0]].reshape(*oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + idt = self.get_input_datatype() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode!
Is currently set to: {} + has to be set to one of the following values ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + if self.calc_tmem() != 0: + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + dim = self.get_nodeattr("Dim") + numReps = 1 * dim * dim + self.code_gen_dict["$DEFINES$"] = [ + """#define Channels1 {}\n #define Kernel1 {}\n + #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format( + self.get_nodeattr("Channels"), + self.get_nodeattr("Kernel"), + self.get_nodeattr("PE"), + numReps, + ) + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + node = self.onnx_node + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<Channels1, Kernel1, SIMD1, PE1, 1, {}, {}, {}> + (in0, out, weights, {}, numReps, {});""".format( + node.op_type, + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + threshs, + self.get_nodeattr("resType"), + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_outstream_width(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + in_fifo_depth = self.get_nodeattr("inFIFODepth") + out_fifo_depth =
self.get_nodeattr("outFIFODepth") + # insert depth pragmas only if specified + if in_fifo_depth != 0: + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth + ) + if out_fifo_depth != 0: + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS stream depth=%d variable=out" % out_fifo_depth + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint<ch*prec> [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " + "complete dim=1" + ) + ) + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " + "complete dim=3" + ) + ) diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py index b172f54622f9779822dae2c6d6005edc8cab42cd..ecf2a711f17ac35c9bf8cb081fb4dc6d9bb6c01e 100644 --- a/src/finn/custom_op/registry.py +++ b/src/finn/custom_op/registry.py @@ -52,6 +52,9 @@ from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch from finn.custom_op.quantavgpool2d import QuantAvgPool2d from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch +from finn.custom_op.fpgadataflow.vector_vector_activate_batch import ( + Vector_Vector_Activate_Batch, +) from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch from finn.custom_op.fpgadataflow.iodma import IODMA from finn.custom_op.debugmarker import DebugMarker @@ -79,6 +82,7 @@ custom_op["AddStreams_Batch"] = AddStreams_Batch custom_op["LabelSelect_Batch"] = LabelSelect_Batch custom_op["QuantAvgPool2d"] = QuantAvgPool2d custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch +custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch custom_op["IODMA"] = IODMA custom_op["DebugMarker"] = DebugMarker diff --git a/src/finn/custom_op/streamingdataflowpartition.py b/src/finn/custom_op/streamingdataflowpartition.py index b63326d676f4ded5ec1dd62f5cc7f02d7acb82ad..31cd38fea3c5a9e88084c3332d46aebdb065f800 100644 --- a/src/finn/custom_op/streamingdataflowpartition.py +++ b/src/finn/custom_op/streamingdataflowpartition.py @@ -36,7 +36,12 @@ class StreamingDataflowPartition(CustomOp): bitfile by itself.""" def get_nodeattr_types(self): - return {"model": ("s", True, "")} + return { + "model": ("s", True, ""), + "res_estimate": ("s", False, ""), + "res_hls": ("s", False, ""), + "res_synth": ("s", False, ""), + } def make_shape_compatible_op(self, model): pass @@ -83,7 +88,7 @@ class StreamingDataflowPartition(CustomOp): ) # verify the number of inputs - if len(self.onnx_node.input) == 1: + if len(self.onnx_node.input) >= 1: info_messages.append("The number of inputs is correct") else: info_messages.append("StreamingDataflowPartition needs 1 data input") diff --git a/src/finn/transformation/fpgadataflow/annotate_cycles.py b/src/finn/transformation/fpgadataflow/annotate_cycles.py new file mode 100644 index 
0000000000000000000000000000000000000000..521c84952daf25982e574421dfba3ff0f7df91ae --- /dev/null +++ b/src/finn/transformation/fpgadataflow/annotate_cycles.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import finn.custom_op.registry as registry +from finn.transformation import Transformation +from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.core.modelwrapper import ModelWrapper +from finn.custom_op.registry import getCustomOp + + +class AnnotateCycles(Transformation): + """Annotate the estimate of clock cycles per sample taken by each fpgadataflow + node as an attribute on the node. 
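+    The per-node value is obtained from that node's get_exp_cycles()
+    implementation (see the apply() method below).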
+ """ + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + # annotate node cycles + for node in graph.node: + if _is_fpgadataflow_node(node): + op_inst = registry.getCustomOp(node) + cycles = op_inst.get_exp_cycles() + op_inst.set_nodeattr("cycles_estimate", cycles) + elif node.op_type == "StreamingDataflowPartition": + # recurse into model to manually annotate per-layer cycles + sdp_model_filename = getCustomOp(node).get_nodeattr("model") + sdp_model = ModelWrapper(sdp_model_filename) + sdp_model = sdp_model.transform(AnnotateCycles()) + # save transformed model + sdp_model.save(sdp_model_filename) + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index 62ee92df54eee2b63d84657515d7fbc3a8808b81..da6fa1ff738690308a9b7686a5c92d7395ab50c8 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -32,6 +32,8 @@ from finn.transformation.move_reshape import _is_fpgadataflow_node from finn.analysis.fpgadataflow.res_estimation import res_estimation from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.analysis.fpgadataflow.post_synth_res import post_synth_res +from finn.core.modelwrapper import ModelWrapper +from finn.custom_op.registry import getCustomOp class AnnotateResources(Transformation): @@ -44,9 +46,10 @@ class AnnotateResources(Transformation): chosen mode (e.g. HLSSynthIP for hls) was previously run. """ - def __init__(self, mode): + def __init__(self, mode, override_res_dict=None): super().__init__() self.mode = mode + self.res_dict = override_res_dict def apply(self, model): graph = model.graph @@ -58,10 +61,33 @@ class AnnotateResources(Transformation): res_fxn = post_synth_res else: raise Exception("Unrecognized mode for AnnotateResources") - res_dict = model.analysis(res_fxn) + if self.res_dict is None: + self.res_dict = model.analysis(res_fxn) + children_dict = {} + # annotate node resources + for node in graph.node: + if _is_fpgadataflow_node(node) and node.name in self.res_dict.keys(): + op_inst = registry.getCustomOp(node) + op_inst.set_nodeattr("res_" + self.mode, str(self.res_dict[node.name])) + children_dict[node.name] = self.res_dict[node.name] + elif node.op_type == "StreamingDataflowPartition": + # recurse into model to manually annotate per-layer resources + sdp_model_filename = getCustomOp(node).get_nodeattr("model") + sdp_model = ModelWrapper(sdp_model_filename) + sdp_model = sdp_model.transform( + AnnotateResources(self.mode, self.res_dict) + ) + sdp_dict = sdp_model.get_metadata_prop("res_total_" + self.mode) + sdp_dict = eval(sdp_dict) + # save transformed model + sdp_model.save(sdp_model_filename) + # set res attribute for sdp node + getCustomOp(node).set_nodeattr("res_" + self.mode, str(sdp_dict)) + children_dict[node.name] = sdp_dict + self.res_dict.update(children_dict) total_dict = {} - for lname in res_dict.keys(): - layer_res_dict = res_dict[lname] + for lname in children_dict.keys(): + layer_res_dict = self.res_dict[lname] for r_type in layer_res_dict.keys(): r_amount = layer_res_dict[r_type] r_amount = float(r_amount) @@ -73,9 +99,4 @@ class AnnotateResources(Transformation): if "efficiency" in k: total_dict[k] = total_dict[k] / len(graph.node) model.set_metadata_prop("res_total_" + self.mode, str(total_dict)) - for node in graph.node: - if _is_fpgadataflow_node(node) and node.name in 
res_dict.keys():
-            op_inst = registry.getCustomOp(node)
-            op_inst.set_nodeattr("res_" + self.mode, str(res_dict[node.name]))
-
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 6fe6e97dfc2f46a150de60011ee715dcb895a9c7..88f5fa926f73d5cb1919a02c83153cb8d1894711 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -29,6 +29,7 @@
 from onnx import helper, TensorProto
 import numpy as np
+import warnings

 from finn.core.datatype import DataType
 from finn.transformation import Transformation
@@ -38,8 +39,10 @@ from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.general import SortGraph
 import finn.core.data_layout as DataLayout
 from finn.util.onnx import nchw_to_nhwc
-import warnings
 from finn.util.basic import get_by_name
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+    MinimizeAccumulatorWidth,
+)


 class InferConvInpGen(Transformation):
@@ -108,6 +111,7 @@ class InferConvInpGen(Transformation):
                         Padding=2 * pad,
                         NumChannels=ifm_ch,
                         inputDataType=dt.name,
+                        SIMD=ifm_ch,
                     )
                     graph.node.insert(node_ind, padding_node)
@@ -488,6 +492,7 @@ class InferBinaryStreamingFCLayer(Transformation):
                     graph.node.remove(n)
                     graph_modified = True
         if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
@@ -508,7 +513,7 @@
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "MatMul":
+            if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None:
                 mm_input = n.input[0]
                 mm_weight = n.input[1]
                 mm_output = n.output[0]
@@ -621,6 +626,151 @@
                     # remove old node
                     graph.node.remove(n)
                     graph_modified = True
+        if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class InferVVAU(Transformation):
+    """Convert MatMul layers with quantized inputs and weights to
+    Vector_Vector_Activate_Batch layers, if the sparsity annotation
+    of the weight matrix indicates that the MatMul layer belongs to
+    a depthwise convolution. Any immediately following MultiThreshold
+    layers will also be absorbed into the VVAU."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "MatMul"
+                and model.get_tensor_sparsity(n.input[1]) is not None
+            ):
+                sparsity = model.get_tensor_sparsity(n.input[1])
+                try:
+                    k = sparsity["dw"]["kernel_shape"]
+                except KeyError:
+                    raise Exception(
+                        """Sparsity doesn't indicate that MatMul
+                        belongs to a depthwise convolution."""
+                    )
+
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                idt = model.get_tensor_datatype(mm_input)
+                wdt = model.get_tensor_datatype(mm_weight)
+                if idt.is_integer() and wdt.is_integer():
+                    mm_output = n.output[0]
+                    W = model.get_initializer(mm_weight)
+                    # infer the dense weight tensor from the sparse weight matrix,
+                    # using the kernel size k extracted above and the number of
+                    # channels.
+                    # the weight matrix has a shape of (k * k * Channels, Channels)
+                    # we need to reverse the creation of the sparse weight matrix
+                    # to achieve a weight tensor of shape (Channels, 1, k, k)
+                    channels = int(W.shape[1])
+                    # transpose to achieve a shape of (Channels, k * k * Channels)
+                    W = W.T
+                    # reshape to (Channels, k, k, Channels) to transpose afterwards
+                    # to (Channels, Channels, k, k)
+                    W = W.reshape(channels, k, k, channels)
+                    W = W.transpose(0, 3, 1, 2)
+                    # now we can extract the values using a for loop over the channels
+                    # and fill a zero numpy array in the correct shape
+                    w_tensor = np.zeros((channels, 1, k, k))
+                    for ch in range(channels):
+                        w_tensor[ch][0] = W[ch][ch]
+                    model.set_initializer(mm_weight, w_tensor)
+                    model.set_tensor_shape(mm_weight, (channels, 1, k, k))
+                    # create node with pe=channels as default
+                    pe = channels
+                    assert (
+                        channels % pe == 0
+                    ), "Requirement Channels divisible by PE is violated."
+                    # see if we have any following thresholds
+                    consumer = model.find_consumer(mm_output)
+                    if consumer is not None and consumer.op_type == "MultiThreshold":
+                        # create VVAU (i.e. including activation)
+                        mt_output = consumer.output[0]
+                        mt_out_shape = model.get_tensor_shape(mt_output)
+                        mt_thres = consumer.input[1]
+                        T = model.get_initializer(mt_thres)
+                        assert (
+                            T.shape[0] == 1 or T.shape[0] == channels
+                        ), """First dimension of
+                        thresholds neither 1 nor Channels."""
+                        odt = model.get_tensor_datatype(mt_output)
+                        scale = getCustomOp(consumer).get_nodeattr("out_scale")
+                        assert (
+                            scale == 1.0
+                        ), "out_scale must be equal to 1.0 for HLS conversion."
+                        actval = getCustomOp(consumer).get_nodeattr("out_bias")
+                        assert (
+                            int(actval) == actval
+                        ), "out_bias must be integer for HLS conversion."
+                        actval = int(actval)
+                        assert (not odt.signed()) or (
+                            actval < 0
+                        ), "Signed output requires actval < 0"
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mt_output, mt_out_shape)
+                        # create and insert new Vector_Vector_Activate_Batch node
+                        new_node = helper.make_node(
+                            "Vector_Vector_Activate_Batch",
+                            [mm_input, mm_weight, mt_thres],
+                            [mt_output],
+                            domain="finn",
+                            backend="fpgadataflow",
+                            resType="ap_resource_lut()",
+                            PE=pe,
+                            Dim=mm_in_shape[1],
+                            Channels=channels,
+                            Kernel=k,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=actval,
+                            noActivation=0,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old nodes
+                        graph.node.remove(n)
+                        graph.node.remove(consumer)
+                        graph_modified = True
+                    else:
+                        # no activation, matmul only
+                        odt = model.get_tensor_datatype(mm_output)
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mm_output, mm_out_shape)
+                        # create and insert new VVAU node
+                        new_node = helper.make_node(
+                            "Vector_Vector_Activate_Batch",
+                            [mm_input, mm_weight],
+                            [mm_output],
+                            domain="finn",
+                            backend="fpgadataflow",
+                            resType="ap_resource_lut()",
+                            PE=pe,
+                            Dim=mm_in_shape[1],
+                            Channels=channels,
+                            Kernel=k,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=0,
+                            noActivation=1,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old node
+                        graph.node.remove(n)
+                        graph_modified = True
         if graph_modified:
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index e0f990600d9ca4be748b662b47ce8296d3d462ce..fb8b4358abd772d13c355f797649dc3b51975b4d 100644
--- 
a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py +++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py @@ -45,58 +45,91 @@ class CreateDataflowPartition(Transformation): super().__init__() def apply(self, model): - # TODO we currently assume that all dataflow nodes are connected to - # each other, forming a single partition. check the assumption and/or - # improve this. - all_nodes = list(model.graph.node) - df_nodes = filter( - lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes - ) - df_nodes = filter( - lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8") - == "fpgadataflow", - df_nodes, - ) - df_nodes = list(df_nodes) - non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes) - non_df_nodes = list(non_df_nodes) - - if len(df_nodes) == 0: - # no changes if no dataflow nodes are present - return (model, False) - else: - # partition the model into two models - df_model = copy.deepcopy(model) - non_df_model = model - # remove all non-dataflow nodes from the dataflow model - for node_to_remove in non_df_nodes: - df_model.graph.node.remove(node_to_remove) - # identify the entry and exit points for the dataflow part - df_in = df_model.graph.node[0].input[0] - df_out = df_model.graph.node[-1].output[0] - df_in_vi = df_model.get_tensor_valueinfo(df_in) - df_out_vi = df_model.get_tensor_valueinfo(df_out) - # set df graph in/out to be df_in/df_out - df_model.graph.input.remove(df_model.graph.input[0]) - df_model.graph.input.insert(0, df_in_vi) - df_model.graph.output.remove(df_model.graph.output[0]) - df_model.graph.output.insert(0, df_out_vi) - df_model_dir = make_build_dir("dataflow_partition_") - df_model_filename = df_model_dir + "/df_model.onnx" - df_model.save(df_model_filename) - # remove all dataflow nodes from the non-dataflow model - # keep track of where the dataflow part starts - df_start_ind = all_nodes.index(df_nodes[0]) - for node_to_remove in df_nodes: - non_df_model.graph.node.remove(node_to_remove) - # create StreamingDataflow node with df_in/df_out io - df_node = helper.make_node( - "StreamingDataflowPartition", - [df_in], - [df_out], - # use the model attribute to mark the df model - model=df_model_filename, + target_partition_id = 0 + # we currently assume that all dataflow nodes belonging to the same partition + # are connected to each other and there is a single input/output to/from each. + # NOTE: all dataflow nodes with no partition_id set are moved to partition 0 + # TODO: check the assumption and/or improve this. 
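+        # Sketch of the loop below: each pass gathers the dataflow nodes tagged
+        # with the current target_partition_id into a child model, replaces them
+        # in the parent graph with a single StreamingDataflowPartition node, then
+        # advances to the next partition id until a pass finds no dataflow nodes.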
+ while True: + all_nodes = list(model.graph.node) + df_nodes = filter( + lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes + ) + df_nodes = filter( + lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8") + == "fpgadataflow" + and ( + get_by_name(x.attribute, "partition_id") is None + or get_by_name(x.attribute, "partition_id").i == target_partition_id + ) + and x.op_type != "StreamingDataflowPartition", + df_nodes, ) - non_df_model.graph.node.insert(df_start_ind, df_node) + df_nodes = list(df_nodes) + non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes) + non_df_nodes = list(non_df_nodes) + + if len(df_nodes) == 0: + # no changes if no dataflow nodes are present + break + else: + # partition the model into two models + df_model = copy.deepcopy(model) + non_df_model = model + # remove all non-dataflow nodes from the dataflow model + for node_to_remove in non_df_nodes: + df_model.graph.node.remove(node_to_remove) + # identify the entry and exit points for the dataflow part + df_in = df_model.graph.node[0].input[0] + df_out = df_model.graph.node[-1].output[0] + df_in_vi = df_model.get_tensor_valueinfo(df_in) + df_out_vi = df_model.get_tensor_valueinfo(df_out) + # set df graph in/out to be df_in/df_out + df_model.graph.input.remove(df_model.graph.input[0]) + df_model.graph.input.insert(0, df_in_vi) + df_model.graph.output.remove(df_model.graph.output[0]) + df_model.graph.output.insert(0, df_out_vi) + # parse StreamingFCLayers looking for external weight memories + fc_extw_nodes = filter( + lambda x: x.op_type == "StreamingFCLayer_Batch" + and get_by_name(x.attribute, "mem_mode") is not None + and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") + == "external", + df_nodes, + ) + fc_extw_nodes = list(fc_extw_nodes) + extra_df_inputs = [] + + for i in range(len(fc_extw_nodes)): + fc_weight_vi = df_model.get_tensor_valueinfo( + fc_extw_nodes[i].input[1] + ) + df_model.graph.input.insert(i + 1, fc_weight_vi) + extra_df_inputs.append(fc_extw_nodes[i].input[1]) + + # save model + df_model_dir = make_build_dir( + "dataflow_partition" + str(target_partition_id) + "_" + ) + df_model_filename = df_model_dir + "/df_model.onnx" + df_model.cleanup() + df_model.save(df_model_filename) + # remove all dataflow nodes from the non-dataflow model + # keep track of where the dataflow part starts + df_start_ind = all_nodes.index(df_nodes[0]) + for node_to_remove in df_nodes: + non_df_model.graph.node.remove(node_to_remove) + # create StreamingDataflow node with df_in/df_out io + df_node = helper.make_node( + "StreamingDataflowPartition", + [df_in] + extra_df_inputs, + [df_out], + # use the model attribute to mark the df model + model=df_model_filename, + domain="finn", + ) + non_df_model.graph.node.insert(df_start_ind, df_node) + model = non_df_model + target_partition_id += 1 - return (non_df_model, False) + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 018ad385f33a8e0aea4aa42599fd47fe5dae57dd..90b4b6c47e6e353c1b606d6918eb271e9c0619c5 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -210,7 +210,8 @@ class CreateStitchedIP(Transformation): assert ( node_inst.get_nodeattr("Direction") == "in" ), """Output TLastMarker incorrect direction""" - elif node.op_type == "IODMA": + elif node.op_type == "IODMA" and len(model.graph.node) != 1: + # don't apply this check for a 
1-node partition assert ( node_inst.get_nodeattr("direction") == "in" ), """Input DMA incorrect direction""" @@ -241,17 +242,11 @@ class CreateStitchedIP(Transformation): if model.find_consumers(node.output[0]) is None: # last node in graph self.connect_m_axis_external(node) - # ensure it is a TLastMarker to have a valid TLast signal - assert ( - node.op_type == "TLastMarker" or node.op_type == "IODMA" - ), """Last node is not TLastMarker or DMA. - Please run transformation InsertTLastMarker/InsertIODMA to ensure - a valid TLast signal""" if node.op_type == "TLastMarker": assert ( node_inst.get_nodeattr("Direction") == "out" ), """Output TLastMarker incorrect direction""" - elif node.op_type == "IODMA": + elif node.op_type == "IODMA" and len(model.graph.node) != 1: assert ( node_inst.get_nodeattr("direction") == "out" ), """Output DMA incorrect direction""" diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index b01f8cbe5c48db6c5288b2db1a8b009ea09ce6c0..6f7fde0c4faba09e584eb578819f44c18639bc9d 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -118,8 +118,11 @@ class InsertFIFO(Transformation): graph_modified = True if graph_modified is False: - # insert FIFO as first node - if graph.node[0].op_type != "StreamingFIFO": + # insert FIFO as first node, except when first node is DMA + if ( + graph.node[0].op_type != "StreamingFIFO" + and graph.node[0].op_type != "IODMA" + ): n = graph.node[0] n_input = n.input[0] n0 = getCustomOp(n) @@ -153,8 +156,11 @@ class InsertFIFO(Transformation): # set fifo output tensor as new input tensor of second node n.input[0] = fifo_output_tensor.name - # insert FIFO as last node - if graph.node[-1].op_type != "StreamingFIFO": + # insert FIFO as last node, except when last node is DMA + if ( + graph.node[-1].op_type != "StreamingFIFO" + and graph.node[-1].op_type != "IODMA" + ): n = graph.node[-1] assert ( n.op_type != "TLastMarker" diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index e4368edea717f7499481e9b1c6ac20f7d5bb5f58..72e5ec4fdd721ecf549adaf7ddd38db4636bce27 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -81,8 +81,8 @@ class InsertIODMA(Transformation): # check if tensor is NHWC assert ( model.get_tensor_layout(graph_out_name) == DataLayout.NHWC - or model.get_tensor_layout(graph_in_name) == DataLayout.NC - ), "Data layout of tensors must be NHWC or NC" + or model.get_tensor_layout(graph_out_name) == DataLayout.NC + ), "Data layout of output tensor must be NHWC or NC" out_shape = model.get_tensor_shape(graph_out_name) out_dtype = model.get_tensor_datatype(graph_out_name) # determine the feasible interface width @@ -120,7 +120,7 @@ class InsertIODMA(Transformation): assert ( model.get_tensor_layout(graph_in_name) == DataLayout.NHWC or model.get_tensor_layout(graph_in_name) == DataLayout.NC - ), "Data layout of tensors must be NHWC or NC" + ), "Data layout of input tensor must be NHWC or NC" in_shape = model.get_tensor_shape(graph_in_name) in_dtype = model.get_tensor_datatype(graph_in_name) # determine the feasible interface width @@ -171,6 +171,7 @@ class InsertIODMA(Transformation): # calculate width of stream output from DMA pe = get_by_name(fc_node.attribute, "PE").i simd = get_by_name(fc_node.attribute, "SIMD").i + assert pe * simd == w_shape[0], "Malformed 
weight matrix" streamWidth = simd * pe * w_dtype.bitwidth() # make new buffer fc_node_in = oh.make_tensor_value_info( @@ -178,12 +179,13 @@ class InsertIODMA(Transformation): ) model.graph.value_info.append(fc_node_in) model.set_tensor_datatype(fc_node_in.name, w_dtype) + model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name)) dma_node = oh.make_node( "IODMA", [fc_w_name], [fc_node_in.name], - numInputVectors=w_shape[:-1], - NumChannels=w_shape[-1], + numInputVectors=[w_shape[1]], + NumChannels=w_shape[0], dataType=str(w_dtype.name), intfWidth=intfwidth, streamWidth=streamWidth, diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 04dd437af27b9fbe18b2255c20a8e4acda03b3d0..bbb0e43fda464e919a7d8c9dcd25e08a49b33cec 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -38,7 +38,8 @@ import numpy as np class InsertTLastMarker(Transformation): """Ensure that the graph is started/terminated with a TLastMarker node, inserting - one if necessary. Use constructor args to determine type of TLastMarker to be inserted. + one if necessary. + Use constructor args to determine type of TLastMarker to be inserted. More information available on the TLastMarker documentation. """ @@ -90,41 +91,78 @@ class InsertTLastMarker(Transformation): graph_modified = True # if both is True, also insert marker on input if self.both: - graph_in_name = model.graph.input[0].name - first_node = model.find_consumer(graph_in_name) - if first_node.op_type != "TLastMarker" and not ( - first_node.op_type == "IODMA" - and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") - == "in" - ): + # detect and parse graph inputs + insert_idx = 0 + graph_in_names = [x.name for x in model.graph.input] + for graph_in_name in graph_in_names: + first_node = model.find_consumers(graph_in_name) + # skip if no consumers (this may be the case for unused initializers) + # TODO: fix this with a cleanup transform + if first_node is None: + continue + assert len(first_node) == 1, "Input fans out to multiple nodes" + first_node = first_node[0] + # several scenarios exclude the node: + # 1. node is a FC layer with internal weights, in which case + # the input is in the list of graph inputs because it has an + # initializer (TODO: fix this with a clean-up transform) + if ( + first_node.op_type == "StreamingFCLayer_Batch" + and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8") + != "external" + ): + continue + # 2. 
node is either a TLastMarker or an input IODMA + if first_node.op_type != "TLastMarker" and not ( + first_node.op_type == "IODMA" + and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") + == "in" + ): - custom_op = getCustomOp(first_node) - num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) - stream_width = int(custom_op.get_instream_width()) - in_shape = model.get_tensor_shape(graph_in_name) - in_dtype = model.get_tensor_datatype(graph_in_name) - elem_width = in_dtype.bitwidth() - # make new buffer - first_node_in = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape - ) - model.graph.value_info.append(first_node_in) - model.set_tensor_datatype(first_node_in.name, in_dtype) - # reroute final node output to first_node_in_name - first_node.input[0] = first_node_in.name - tlast_node = oh.make_node( - "TLastMarker", - [graph_in_name], - [first_node_in.name], - NumIters=num_iters, - StreamWidth=stream_width, - ElemWidth=elem_width, - DynIters=(1 if self.dyniters else 0), - Direction="in", - Protocol=("external" if self.external else "internal"), - domain="finn", - backend="fpgadataflow", - ) - model.graph.node.insert(0, tlast_node) - graph_modified = True + custom_op = getCustomOp(first_node) + num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) + inp_idx = list(first_node.input).index(graph_in_name) + if inp_idx > 0: + if ( + first_node.op_type == "StreamingFCLayer_Batch" + and inp_idx == 1 + ): + stream_width = int(custom_op.get_weightstream_width()) + elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1: + stream_width = int(custom_op.get_instream_width()) + else: + raise Exception("No method to determine stream width") + else: + stream_width = int(custom_op.get_instream_width()) + in_shape = model.get_tensor_shape(graph_in_name) + in_dtype = model.get_tensor_datatype(graph_in_name) + elem_width = in_dtype.bitwidth() + # make new buffer + first_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape + ) + model.graph.value_info.append(first_node_in) + model.set_tensor_datatype(first_node_in.name, in_dtype) + ini = model.get_initializer(graph_in_name) + # copy initializer if it exists + if ini is not None: + model.set_initializer(first_node_in.name, ini) + # reroute final node output to first_node_in_name + first_node.input[inp_idx] = first_node_in.name + tlast_node = oh.make_node( + "TLastMarker", + [graph_in_name], + [first_node_in.name], + NumIters=num_iters, + StreamWidth=stream_width, + ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="in", + Protocol=("external" if self.external else "internal"), + domain="finn", + backend="fpgadataflow", + ) + model.graph.node.insert(insert_idx, tlast_node) + graph_modified = True + insert_idx += 1 return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 18d3db18da089a5dda4dbb6d97180dd4a20613b5..fc326b4a25a9784f3919b4246ec2b8f54fb881f4 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -26,9 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import os -import shutil +import shutil from finn.custom_op.registry import getCustomOp from finn.transformation import Transformation from finn.util.basic import gen_finn_dt_tensor, get_finn_root, make_build_dir @@ -42,19 +41,18 @@ class MakePYNQDriver(Transformation): accelerator, including data packing/unpacking. The MakePYNQProject transformation must have been already applied. + platform: one of ["zynq", "zynq-iodma", "alveo"] + Outcome if successful: sets the pynq_driver_dir attribute in the ONNX ModelProto's metadata_props field, with the created driver dir as the value. """ - def __init__(self): + def __init__(self, platform): super().__init__() + self.platform = platform def apply(self, model): - vivado_pynq_proj = model.get_metadata_prop("vivado_pynq_proj") - if vivado_pynq_proj is None or (not os.path.isdir(vivado_pynq_proj)): - raise Exception("No PYNQ project found, apply MakePYNQProject first.") - # create a temporary folder for the generated driver pynq_driver_dir = make_build_dir(prefix="pynq_driver_") model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) @@ -67,11 +65,21 @@ class MakePYNQDriver(Transformation): o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name)) i_tensor_dt = model.get_tensor_datatype(i_tensor_name) o_tensor_dt = model.get_tensor_datatype(o_tensor_name) - # extract HLSCustomOp instances to get folded i/o shapes - first_node = getCustomOp(model.find_consumer(i_tensor_name)) - last_node = getCustomOp(model.find_producer(o_tensor_name)) - i_tensor_shape_folded = tuple(first_node.get_folded_input_shape()) - o_tensor_shape_folded = tuple(last_node.get_folded_output_shape()) + # handle folded i/o shapes due to differences in DMA engines + if self.platform == "zynq": + # extract HLSCustomOp instances to get folded i/o shapes + first_node = getCustomOp(model.find_consumer(i_tensor_name)) + last_node = getCustomOp(model.find_producer(o_tensor_name)) + i_tensor_shape_folded = tuple(first_node.get_folded_input_shape()) + o_tensor_shape_folded = tuple(last_node.get_folded_output_shape()) + else: + i_tensor_shape_folded = list(i_tensor_shape_normal) + i_tensor_shape_folded.insert(-1, 1) + i_tensor_shape_folded = tuple(i_tensor_shape_folded) + o_tensor_shape_folded = list(o_tensor_shape_normal) + o_tensor_shape_folded.insert(-1, 1) + o_tensor_shape_folded = tuple(o_tensor_shape_folded) + # generate dummy folded i/o tensors and their packed versions i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded) o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded) @@ -98,6 +106,7 @@ class MakePYNQDriver(Transformation): ret = ret.replace("[1,", "[%s," % batch_var_name) return ret + driver = driver.replace("$PLATFORM$", self.platform) driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt)) driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal)) driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded)) @@ -108,7 +117,12 @@ class MakePYNQDriver(Transformation): driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed)) # clock settings for driver - clk_ns = float(model.get_metadata_prop("clk_ns")) + clk_ns = model.get_metadata_prop("clk_ns") + # default to 10ns / 100 MHz if property not set + if clk_ns is None: + clk_ns = 10.0 + else: + clk_ns = float(clk_ns) fclk_mhz = 1 / (clk_ns * 0.001) # TODO change according to PYNQ board? 
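+        # note (assumption): fclk0_mhz below is the PL clock attribute exposed by
+        # the PYNQ Clocks API on Zynq-7000 boards; other boards may need a
+        # different $CLK_NAME$, hence the TODO above.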
driver = driver.replace("$CLK_NAME$", "fclk0_mhz") diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py index a874d7a7c702e1b3e9125fc031aa65dc287a407d..5e45d6f230503668a15d784e3c6afa45560fe004 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py @@ -128,6 +128,8 @@ class MakePYNQProject(Transformation): # filename for the synth utilization report synth_report_filename = vivado_pynq_proj_dir + "/synth_report.xml" model.set_metadata_prop("vivado_synth_rpt", synth_report_filename) + # set platform attribute for correct remote execution + model.set_metadata_prop("platform", "zynq") # get metadata property clk_ns to calculate clock frequency clk_ns = float(model.get_metadata_prop("clk_ns")) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py new file mode 100644 index 0000000000000000000000000000000000000000..095327be0d3c36f201bcf343d8aea61aa069b8e1 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -0,0 +1,319 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
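+# Typical usage, as a sketch (assumes a board name known to pynq_part_map,
+# e.g. "Pynq-Z2", and a 10 ns target clock period):
+#
+#   model = model.transform(ZynqBuild(platform="Pynq-Z2", period_ns=10))
+#
+# ZynqBuild drives the full flow defined in this file; MakeZYNQProject on its
+# own assumes CreateStitchedIP has already been run on each partition.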
+
+import os
+import subprocess
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import get_by_name, make_build_dir
+from finn.util.basic import get_num_default_workers
+from finn.util.basic import pynq_part_map
+
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.floorplan import Floorplan
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from shutil import copy
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+
+from . import templates
+
+
+def collect_ip_dirs(model, ipstitch_path):
+    # collect list of all IP dirs
+    ip_dirs = []
+    for node in model.graph.node:
+        ip_dir_attribute = get_by_name(node.attribute, "ip_path")
+        assert (
+            ip_dir_attribute is not None
+        ), """Node attribute "ip_path" is
+        empty. Please run transformation HLSSynthIP first."""
+        ip_dir_value = ip_dir_attribute.s.decode("UTF-8")
+        assert os.path.isdir(
+            ip_dir_value
+        ), """The directory that should
+        contain the generated ip blocks doesn't exist."""
+        ip_dirs += [ip_dir_value]
+    ip_dirs += [ipstitch_path + "/ip"]
+    return ip_dirs
+
+
+class MakeZYNQProject(Transformation):
+    """Create a Vivado overlay project (including the shell infrastructure)
+    from the already-stitched IP block for this graph.
+    All nodes in the graph must have the fpgadataflow backend attribute,
+    and the CreateStitchedIP transformation must have been previously run on
+    the graph. This is functionally equivalent to MakePYNQProject, but it does
+    not use the PYNQ shell infrastructure and instead creates a fully custom
+    block design. However, this transform requires DMAs in the accelerator
+    design.
+
+    Outcome if successful: sets the vivado_pynq_proj attribute in the ONNX
+    ModelProto's metadata_props field, with the created project dir as the
+    value.
+    """
+
+    def __init__(self, platform, enable_debug=False):
+        super().__init__()
+        self.platform = platform
+        self.enable_debug = 1 if enable_debug else 0
+
+    def apply(self, model):
+
+        # collect the TCL commands for the block design into config
+        config = []
+        idma_idx = 0
+        odma_idx = 0
+        aximm_idx = 0
+        axilite_idx = 0
+        global_clk_ns = 0
+        instance_names = {}
+        for node in model.graph.node:
+            assert node.op_type == "StreamingDataflowPartition", "Invalid link graph"
+            sdp_node = getCustomOp(node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            kernel_model = ModelWrapper(dataflow_model_filename)
+
+            ipstitch_path = kernel_model.get_metadata_prop("vivado_stitch_proj")
+            if ipstitch_path is None or (not os.path.isdir(ipstitch_path)):
+                raise Exception(
+                    "No stitched IPI design found for %s, apply CreateStitchedIP first."
+                    % node.name
+                )
+
+            vivado_stitch_vlnv = kernel_model.get_metadata_prop("vivado_stitch_vlnv")
+            if vivado_stitch_vlnv is None:
+                raise Exception(
+                    "No vlnv found for %s, apply CreateStitchedIP first." % node.name
+                )
+
+            ip_dirs = ["list"]
+            ip_dirs += collect_ip_dirs(kernel_model, ipstitch_path)
+            ip_dirs_str = "[%s]" % (" ".join(ip_dirs))
+            config.append(
+                "set_property ip_repo_paths "
+                "[concat [get_property ip_repo_paths [current_project]] %s] "
+                "[current_project]" % ip_dirs_str
+            )
+            config.append("update_ip_catalog -rebuild -scan_changes")
+
+            # get metadata property clk_ns to calculate clock frequency
+            clk_ns = float(kernel_model.get_metadata_prop("clk_ns"))
+            if clk_ns > global_clk_ns:
+                global_clk_ns = clk_ns
+
+            # gather info on connectivity
+            # assume each node connected to outputs/inputs is DMA:
+            # has axis, aximm and axilite
+            # everything else is axis-only
+            # assume only one connection from each ip to the next
+            # all aximm allocated to DDR[0]
+            # all kernels allocated to SLR0
+            producer = model.find_producer(node.input[0])
+            consumer = model.find_consumers(node.output[0])
+            # define kernel instances
+            # name kernels connected to graph inputs as idmaxx
+            # name kernels connected to graph outputs as odmaxx
+            if producer is None or consumer is None:
+                if producer is None:
+                    instance_names[node.name] = "idma" + str(idma_idx)
+                elif consumer is None:
+                    instance_names[node.name] = "odma" + str(odma_idx)
+                config.append(
+                    "create_bd_cell -type ip -vlnv %s %s"
+                    % (vivado_stitch_vlnv, instance_names[node.name])
+                )
+                config.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/m_axi_gmem0] "
+                    "[get_bd_intf_pins smartconnect_0/S%02d_AXI]"
+                    % (instance_names[node.name], aximm_idx)
+                )
+                config.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/s_axi_control] "
+                    "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]"
+                    % (instance_names[node.name], axilite_idx)
+                )
+                idma_idx += 1
+                aximm_idx += 1
+                axilite_idx += 1
+            else:
+                instance_names[node.name] = node.name
+                config.append(
+                    "create_bd_cell -type ip -vlnv %s %s"
+                    % (vivado_stitch_vlnv, instance_names[node.name])
+                )
+            config.append(
+                "connect_bd_net [get_bd_pins %s/ap_clk] "
+                "[get_bd_pins smartconnect_0/aclk]" % instance_names[node.name]
+            )
+            config.append(
+                "connect_bd_net [get_bd_pins %s/ap_rst_n] "
+                "[get_bd_pins smartconnect_0/aresetn]" % instance_names[node.name]
+            )
+            # connect streams
+            if producer is not None:
+                for i in range(len(node.input)):
+                    producer = model.find_producer(node.input[i])
+                    if producer is not None:
+                        j = list(producer.output).index(node.input[i])
+                        config.append(
+                            "connect_bd_intf_net [get_bd_intf_pins %s/s_axis_%d] "
+                            "[get_bd_intf_pins %s/m_axis_%d]"
+                            % (
+                                instance_names[node.name],
+                                i,
+                                instance_names[producer.name],
+                                j,
+                            )
+                        )
+
+        # create a temporary folder for the project
+        vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_")
+        model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir)
+
+        fclk_mhz = int(1 / (global_clk_ns * 0.001))
+
+        # create a TCL recipe for the project
+        ipcfg = vivado_pynq_proj_dir + "/ip_config.tcl"
+        config = "\n".join(config) + "\n"
+        with open(ipcfg, "w") as f:
+            f.write(
+                templates.custom_zynq_shell_template
+                % (
+                    fclk_mhz,
+                    axilite_idx,
+                    aximm_idx,
+                    self.platform,
+                    pynq_part_map[self.platform],
+                    config,
+                    self.enable_debug,
+                    get_num_default_workers(),
+                )
+            )
+
+        # create a shell script to run the Vivado TCL recipe
+        synth_project_sh = vivado_pynq_proj_dir + "/synth_project.sh"
+        working_dir = os.environ["PWD"]
+        with 
open(synth_project_sh, "w") as f: + f.write("#!/bin/bash \n") + f.write("cd {}\n".format(vivado_pynq_proj_dir)) + f.write("vivado -mode tcl -source %s\n" % ipcfg) + f.write("cd {}\n".format(working_dir)) + + # call the synthesis script + bash_command = ["bash", synth_project_sh] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate() + bitfile_name = ( + vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" + ) + if not os.path.isfile(bitfile_name): + raise Exception("Synthesis failed, no bitfile found") + deploy_bitfile_name = vivado_pynq_proj_dir + "/resizer.bit" + copy(bitfile_name, deploy_bitfile_name) + # set bitfile attribute + model.set_metadata_prop("vivado_pynq_bitfile", deploy_bitfile_name) + # set platform attribute for correct remote execution + model.set_metadata_prop("platform", "zynq-iodma") + hwh_name = ( + vivado_pynq_proj_dir + + "/finn_zynq_link.srcs/sources_1/bd/top/hw_handoff/top.hwh" + ) + if not os.path.isfile(hwh_name): + raise Exception("Synthesis failed, no hardware handoff file found") + deploy_hwh_name = vivado_pynq_proj_dir + "/resizer.hwh" + copy(hwh_name, deploy_hwh_name) + # filename for the synth utilization report + synth_report_filename = vivado_pynq_proj_dir + "/synth_report.xml" + model.set_metadata_prop("vivado_synth_rpt", synth_report_filename) + return (model, False) + + +class ZynqBuild(Transformation): + """Best-effort attempt at building the accelerator for Zynq.""" + + def __init__(self, platform, period_ns, enable_debug=False): + super().__init__() + self.fpga_part = pynq_part_map[platform] + self.period_ns = period_ns + self.platform = platform + self.enable_debug = enable_debug + + def apply(self, model): + # first infer layouts + model = model.transform(InferDataLayouts()) + # prepare at global level, then break up into kernels + prep_transforms = [ + MakePYNQDriver(platform="zynq-iodma"), + InsertIODMA(64), + InsertDWC(), + Floorplan(), + CreateDataflowPartition(), + ] + for trn in prep_transforms: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + # Build each kernel individually + sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition") + for sdp_node in sdp_nodes: + prefix = sdp_node.name + "_" + sdp_node = getCustomOp(sdp_node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + kernel_model = kernel_model.transform(InsertFIFO()) + kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) + kernel_model.save(dataflow_model_filename) + kernel_model = kernel_model.transform( + PrepareIP(self.fpga_part, self.period_ns) + ) + kernel_model = kernel_model.transform(HLSSynthIP()) + kernel_model = kernel_model.transform(ReplaceVerilogRelPaths()) + kernel_model = kernel_model.transform( + CreateStitchedIP( + self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True + ) + ) + kernel_model.save(dataflow_model_filename) + # Assemble design from IPs + model = model.transform( + MakeZYNQProject(self.platform, enable_debug=self.enable_debug) + ) + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py new file mode 100644 index 0000000000000000000000000000000000000000..2c54a5efbd3b28f0fbfd074b512929edab234e78 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py @@ -0,0 
+1,48 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from finn.custom_op.registry import getCustomOp +from finn.transformation import Transformation +from finn.util.fpgadataflow import is_fpgadataflow_node + + +class MinimizeAccumulatorWidth(Transformation): + """For relevant nodes, call the accumulator width minimization + functions to save on resources. May alter tensor DataType for + certain nodes if they produce an accumulator as result.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + for node in model.graph.node: + if is_fpgadataflow_node(node) is True: + inst = getCustomOp(node) + if hasattr(inst, "minimize_accumulator_width"): + inst.minimize_accumulator_width(model) + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index ab9fd03251819aee72f74cc0c1fa17b99b1e05a4..3bd74ec6a2071db820a35a9440eedd74092354e1 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -104,9 +104,10 @@ from finn.core.datatype import DataType from pynq.ps import Clocks class FINNAccelDriver(): - def __init__(self, N, bitfile): + def __init__(self, N, bitfile, platform="$PLATFORM$"): \"\"\"Instantiate the FINN accelerator driver. 
Gets batchsize (N) as integer and path to bitfile as string.\"\"\" + self.platform = platform self.N = N # input FINN DataType self.idt = $INPUT_FINN_DATATYPE$ @@ -119,21 +120,37 @@ class FINNAccelDriver(): self.oshape_folded = $OUTPUT_SHAPE_FOLDED$ self.ishape_packed = $INPUT_SHAPE_PACKED$ # datatype np.uint8 self.oshape_packed = $OUTPUT_SHAPE_PACKED$ # datatype np.uint8 - # clock frequency - self.fclk_mhz = $CLOCK_FREQ_MHZ$ # load bitfile and set up accelerator self.ol = Overlay(bitfile) - # set the clock frequency as specified by user during transformations - Clocks.$CLK_NAME$ = self.fclk_mhz - self.dma = self.ol.axi_dma_0 - self.ctrl_regs = self.ol.resize_accel_0 # neuron folding factor of output = iterations per sample self.itersPerSample = self.oshape_packed[-2] - # AXI lite register offset for number of iterations - # used by TLastMarker to signal end of transmission for AXI CDMA - self.REG_OFFSET_NUM_ITERS = 0x10 - # set up TLastMarker with correct num. samples - self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, self.N*self.itersPerSample) + if self.platform == "zynq": + # clock frequency + self.fclk_mhz = $CLOCK_FREQ_MHZ$ + # set the clock frequency as specified by user during transformations + if self.fclk_mhz > 0: + Clocks.$CLK_NAME$ = self.fclk_mhz + self.dma = self.ol.axi_dma_0 + self.ctrl_regs = self.ol.resize_accel_0 + + # AXI lite register offset for number of iterations + # used by TLastMarker to signal end of transmission for AXI CDMA + self.REG_OFFSET_NUM_ITERS = 0x10 + # set up TLastMarker with correct num. samples + self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, self.N*self.itersPerSample) + elif self.platform == "alveo": + self.idma = self.ol.idma0 + self.odma = self.ol.odma0 + elif self.platform == "zynq-iodma": + self.idma = self.ol.idma0 + self.odma = self.ol.odma0 + # clock frequency + self.fclk_mhz = $CLOCK_FREQ_MHZ$ + # set the clock frequency as specified by user during transformations + if self.fclk_mhz > 0: + Clocks.$CLK_NAME$ = self.fclk_mhz + else: + raise ValueError("Supported platforms are zynq zynq-iodma alveo") # allocate a PYNQ buffer for the packed input and buffer self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8) @@ -176,19 +193,42 @@ class FINNAccelDriver(): np.copyto(self.ibuf_packed_device, data) def execute(self): - \"\"\"Executes accelerator by setting up the DMA and - waiting until all transfers complete. Uses only member variables and + \"\"\"Executes accelerator by setting up the DMA(s) and + waiting until all transfers/calls complete. 
Uses only member variables and returns nothing.\"\"\" - dma = self.dma - dma.sendchannel.transfer(self.ibuf_packed_device) - dma.recvchannel.transfer(self.obuf_packed_device) - dma.sendchannel.wait() - dma.recvchannel.wait() + if self.platform == "zynq": + dma = self.dma + dma.sendchannel.transfer(self.ibuf_packed_device) + dma.recvchannel.transfer(self.obuf_packed_device) + dma.sendchannel.wait() + dma.recvchannel.wait() + elif self.platform == "zynq-iodma": + # manually launch IODMAs since signatures are missing + self.idma.write(0x10, self.ibuf_packed_device.device_address) + self.idma.write(0x1c, self.N) + self.odma.write(0x10, self.obuf_packed_device.device_address) + self.odma.write(0x1c, self.N) + self.idma.write(0x00, 1) + self.odma.write(0x00, 1) + # wait until output IODMA is finished + status = self.odma.read(0x00) + while status & 0x2 == 0: + status = self.odma.read(0x00) + + elif self.platform == "alveo": + self.ibuf_packed_device.sync_to_device() + self.idma.start(self.ibuf_packed_device, self.N) + self.odma.start(self.obuf_packed_device, self.N) + self.idma.wait() + self.odma.wait() + self.obuf_packed_device.sync_from_device() + if __name__ == "__main__": parser = argparse.ArgumentParser(description='Set exec mode, batchsize N, bitfile name, inputfile name and outputfile name') parser.add_argument('--exec_mode', help='Please select functional verification ("execute") or throughput test ("throughput_test")', default="execute") + parser.add_argument('--platform', help='Target platform: zynq zynq-iodma alveo', default="zynq") parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1) parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit") parser.add_argument('--inputfile', help='name of input npy file (i.e. 
"input.npy")', default="input.npy") @@ -196,13 +236,14 @@ if __name__ == "__main__": # parse arguments args = parser.parse_args() exec_mode = args.exec_mode + platform = args.platform N = args.batchsize bitfile = args.bitfile inputfile = args.inputfile outputfile = args.outputfile # instantiate FINN accelerator driver and pass batchsize and bitfile - finnDriver = FINNAccelDriver(N, bitfile) + finnDriver = FINNAccelDriver(N, bitfile, platform) # for the remote execution the data from the input npy file has to be loaded, # packed and copied to the PYNQ buffer @@ -258,3 +299,126 @@ if __name__ == "__main__": """ + +custom_zynq_shell_template = """ +set FREQ_MHZ %s +set NUM_AXILITE %d +if {$NUM_AXILITE > 9} { + error "Maximum 10 AXI-Lite interfaces supported" +} +set NUM_AXIMM %d +set BOARD %s +set FPGA_PART %s +create_project finn_zynq_link ./ -part $FPGA_PART + +# set board part repo paths to find PYNQ-Z1/Z2 +set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] +set paths_param [get_param board.repoPaths] +lappend paths_prop /workspace/finn/board_files +lappend paths_param /workspace/finn/board_files +set_property BOARD_PART_REPO_PATHS $paths_prop [current_project] +set_param board.repoPaths $paths_param + +if {$BOARD == "ZCU104"} { + set_property board_part xilinx.com:zcu104:part0:1.1 [current_project] + set ZYNQ_TYPE "zynq_us+" +} elseif {$BOARD == "Ultra96"} { + set ZYNQ_TYPE "zynq_us+" +} elseif {$BOARD == "Pynq-Z2"} { + set ZYNQ_TYPE "zynq_7000" +} elseif {$BOARD == "Pynq-Z1"} { + set ZYNQ_TYPE "zynq_7000" + set_property board_part www.digilentinc.com:pynq-z1:part0:1.0 [current_project] +} else { + puts "Unrecognized board" +} + +create_bd_design "top" +if {$ZYNQ_TYPE == "zynq_us+"} { + create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ps + apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] + #activate one slave port, deactivate the second master port + set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {1}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {0}] [get_bd_cells zynq_ps] + #set frequency of PS clock (this can't always be exactly met) + set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] +} elseif {$ZYNQ_TYPE == "zynq_7000"} { + create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 zynq_ps + apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] +} else { + puts "Unrecognized Zynq type" +} + +#instantiate axi interconnect, axi smartconnect +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_0 +create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0 +#set number of axilite interfaces, and number of axi master interfaces +set_property -dict [list CONFIG.NUM_SI $NUM_AXILITE] [get_bd_cells smartconnect_0] +set_property -dict [list CONFIG.NUM_MI $NUM_AXIMM] [get_bd_cells axi_interconnect_0] + +#create reset controller and connect interconnects to PS +if {$ZYNQ_TYPE == "zynq_us+"} { + connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0_FPD] + connect_bd_intf_net 
[get_bd_intf_pins zynq_ps/M_AXI_HPM0_FPD] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/S00_AXI] + #connect interconnect clocks and resets + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/ACLK] + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/S00_ACLK] + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins zynq_ps/saxihp0_fpd_aclk] +} elseif {$ZYNQ_TYPE == "zynq_7000"} { + connect_bd_intf_net -boundary_type upper [get_bd_intf_pins zynq_ps/M_AXI_GP0] [get_bd_intf_pins axi_interconnect_0/S00_AXI] + connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0] + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/FCLK_CLK0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/ACLK] + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/FCLK_CLK0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/S00_ACLK] + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/FCLK_CLK0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins zynq_ps/S_AXI_HP0_ACLK] +} +connect_bd_net [get_bd_pins axi_interconnect_0/ARESETN] [get_bd_pins smartconnect_0/aresetn] + +#custom IP instantiations/connections start here +%s + +# set up debug +if {%d == 1} { + set_property HDL_ATTRIBUTE.DEBUG true [get_bd_intf_nets {idma0_m_axis_0}] + set_property HDL_ATTRIBUTE.DEBUG true [get_bd_intf_nets {StreamingDataflowPartition_1_m_axis_0}] + set_property HDL_ATTRIBUTE.DEBUG true [get_bd_intf_nets {smartconnect_0_M00_AXI}] + apply_bd_automation -rule xilinx.com:bd_rule:debug -dict [list \ + [get_bd_intf_nets smartconnect_0_M00_AXI] {AXI_R_ADDRESS "Data and Trigger" AXI_R_DATA "Data and Trigger" AXI_W_ADDRESS "Data and Trigger" AXI_W_DATA "Data and Trigger" AXI_W_RESPONSE "Data and Trigger" CLK_SRC "/zynq_ps/FCLK_CLK0" SYSTEM_ILA "Auto" APC_EN "0" } \ + [get_bd_intf_nets idma0_m_axis_0] {AXIS_SIGNALS "Data and Trigger" CLK_SRC "/zynq_ps/FCLK_CLK0" SYSTEM_ILA "Auto" APC_EN "0" } \ + [get_bd_intf_nets StreamingDataflowPartition_1_m_axis_0] {AXIS_SIGNALS "Data and Trigger" CLK_SRC "/zynq_ps/FCLK_CLK0" SYSTEM_ILA "Auto" APC_EN "0" } \ + ] +} + +#finalize clock and reset connections for interconnects +set i 0 +while {$i < $NUM_AXILITE} { + apply_bd_automation -quiet -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/FCLK_CLK0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/M0${i}_ACLK] + incr i +} + +save_bd_design +assign_bd_address +validate_bd_design + +set_property SYNTH_CHECKPOINT_MODE "Hierarchical" [ get_files top.bd ] +make_wrapper -files [get_files top.bd] -import -fileset sources_1 -top + +set_property strategy Flow_PerfOptimized_high [get_runs synth_1] +set_property STEPS.SYNTH_DESIGN.ARGS.DIRECTIVE AlternateRoutability [get_runs synth_1] +set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true [get_runs synth_1] +set_property strategy Performance_ExtraTimingOpt [get_runs impl_1] +set_property STEPS.OPT_DESIGN.ARGS.DIRECTIVE Explore [get_runs impl_1] +set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1] +set_property STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE 
AggressiveExplore [get_runs impl_1] +set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED true [get_runs impl_1] + +# out-of-context synth can't be used for bitstream generation +# set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} -value {-mode out_of_context} -objects [get_runs synth_1] +launch_runs -to_step write_bitstream impl_1 -jobs %d +wait_on_run [get_runs impl_1] + +# generate synthesis report +open_run synth_1 -name synth_1 +report_utilization -hierarchical -hierarchical_depth 4 -file synth_report.xml -format xml +""" diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py new file mode 100644 index 0000000000000000000000000000000000000000..2df58c537250c102ee85a685fc32904ee879e38f --- /dev/null +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -0,0 +1,322 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
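+# Rough sketch of how the transformations in this file are meant to be
+# used together (the U250 part and platform strings below are taken from
+# finn.util.basic and serve only as an example; the model filename and
+# clock period are illustrative):
+#
+#   from finn.core.modelwrapper import ModelWrapper
+#   from finn.transformation.fpgadataflow.vitis_build import VitisBuild
+#
+#   model = ModelWrapper("dataflow_model.onnx")
+#   model = model.transform(
+#       VitisBuild(
+#           fpga_part="xcu250-figd2104-2L-e",
+#           period_ns=5.0,
+#           platform="xilinx_u250_xdma_201830_2",
+#       )
+#   )
+#   xclbin = model.get_metadata_prop("vitis_xclbin")
+#
+# Internally, CreateVitisXO packages each stitched-IP partition into a .xo
+# object file and VitisLink then calls v++ to link all .xo files into the
+# final .xclbin.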
+ +import os +import subprocess + +from finn.core.modelwrapper import ModelWrapper +from finn.transformation import Transformation +from finn.custom_op.registry import getCustomOp + +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker +from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.floorplan import Floorplan +from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from finn.util.basic import make_build_dir +from finn.transformation.infer_data_layouts import InferDataLayouts + + +def _check_vitis_envvars(): + assert "VITIS_PATH" in os.environ, "VITIS_PATH must be set for Vitis" + assert ( + "PLATFORM_REPO_PATHS" in os.environ + ), "PLATFORM_REPO_PATHS must be set for Vitis" + assert ( + "XILINX_XRT" in os.environ + ), "XILINX_XRT must be set for Vitis, ensure the XRT env is sourced" + + +class CreateVitisXO(Transformation): + """Create a Vitis object file from a stitched FINN ip. + + Outcome if successful: sets the vitis_xo attribute in the ONNX + ModelProto's metadata_props field with the name of the object file as value. + The object file can be found under the ip subdirectory. 
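+
+    Each kernel argument is passed to package_xo as a -kernel_xml_args
+    entry; for example, a 64-bit dynamic input stream would produce an
+    entry like {in:4:0:s_axis_0:0x0:0x0:ap_uint<64>:0} (the stream width
+    and indices here are illustrative; see UG1393 for the argument syntax).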
+ """ + + def __init__(self, ip_name="finn_design"): + super().__init__() + self.ip_name = ip_name + + def apply(self, model): + _check_vitis_envvars() + vivado_proj_dir = model.get_metadata_prop("vivado_stitch_proj") + stitched_ip_dir = vivado_proj_dir + "/ip" + args_string = [] + m_axis_idx = 0 + s_axis_idx = 0 + # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface + # developed from instructions in UG1393 (v2019.2) and package_xo documentation + # package_xo is responsible for generating the kernel xml + for node in model.graph.node: + node_inst = getCustomOp(node) + arg_id = 0 + if node.op_type == "TLastMarker": + stream_width = node_inst.get_nodeattr("StreamWidth") + # add a stream input or output port, based on direction + if node_inst.get_nodeattr("Direction") == "in": + args_string.append( + "{in:4:%s:s_axis_%d:0x0:0x0:ap_uint<%s>:0}" + % (str(arg_id), s_axis_idx, str(stream_width)) + ) + s_axis_idx += 1 + else: + args_string.append( + "{out:4:%s:m_axis_%d:0x0:0x0:ap_uint<%s>:0}" + % (str(arg_id), m_axis_idx, str(stream_width)) + ) + m_axis_idx += 1 + arg_id += 1 + # add a axilite port if dynamic + # add a count parameter if dynamic + if node_inst.get_nodeattr("DynIters") == 1: + args_string.append( + "{numReps:0:%s:s_axi_control:0x4:0x10:uint:0}" % str(arg_id) + ) + arg_id += 1 + elif node.op_type == "IODMA": + port_width = node_inst.get_nodeattr("intfWidth") + # add an address parameter + # add a count parameter + args_string.append( + "{addr:1:%s:m_axi_gmem0:0x8:0x10:ap_uint<%s>*:0}" + % (str(arg_id), str(port_width)) + ) + arg_id += 1 + args_string.append( + "{numReps:0:%s:s_axi_control:0x4:0x1C:uint:0}" % str(arg_id) + ) + arg_id += 1 + + # save kernel xml then run package_xo + xo_name = self.ip_name + ".xo" + xo_path = vivado_proj_dir + "/" + xo_name + model.set_metadata_prop("vitis_xo", xo_path) + + # generate the package_xo command in a tcl script + package_xo_string = ( + "package_xo -force -xo_path %s -kernel_name %s -ip_directory %s" + % (xo_path, self.ip_name, stitched_ip_dir) + ) + for arg in args_string: + package_xo_string += " -kernel_xml_args " + arg + with open(vivado_proj_dir + "/gen_xo.tcl", "w") as f: + f.write(package_xo_string) + + # create a shell script and call Vivado + package_xo_sh = vivado_proj_dir + "/gen_xo.sh" + working_dir = os.environ["PWD"] + with open(package_xo_sh, "w") as f: + f.write("#!/bin/bash \n") + f.write("cd {}\n".format(vivado_proj_dir)) + f.write("vivado -mode batch -source gen_xo.tcl\n") + f.write("cd {}\n".format(working_dir)) + bash_command = ["bash", package_xo_sh] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate() + assert os.path.isfile(xo_path), ( + "Vitis .xo file not created, check logs under %s" % vivado_proj_dir + ) + return (model, False) + + +class VitisLink(Transformation): + """Create an XCLBIN with Vitis. + + Outcome if successful: sets the vitis_xclbin attribute in the ONNX + ModelProto's metadata_props field with the XCLBIN full path as value. 
+ """ + + def __init__(self, platform, f_mhz=200): + super().__init__() + self.platform = platform + self.f_mhz = f_mhz + + def apply(self, model): + _check_vitis_envvars() + # create a config file and empty list of xo files + config = ["[connectivity]"] + object_files = [] + idma_idx = 0 + odma_idx = 0 + instance_names = {} + for node in model.graph.node: + assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" + sdp_node = getCustomOp(node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + kernel_xo = kernel_model.get_metadata_prop("vitis_xo") + object_files.append(kernel_xo) + # gather info on connectivity + # assume each node connected to outputs/inputs is DMA: + # has axis, aximm and axilite + # everything else is axis-only + # assume only one connection from each ip to the next + # all aximm allocated to DDR[0] + # all kernels allocated to SLR0 + producer = model.find_producer(node.input[0]) + consumer = model.find_consumers(node.output[0]) + # define kernel instances + # name kernels connected to graph inputs as idmaxx + # name kernels connected to graph inputs as odmaxx + if producer is None: + instance_names[node.name] = "idma" + str(idma_idx) + config.append("nk=%s:1:%s" % (node.name, instance_names[node.name])) + idma_idx += 1 + elif consumer is None: + instance_names[node.name] = "odma" + str(odma_idx) + config.append("nk=%s:1:%s" % (node.name, instance_names[node.name])) + odma_idx += 1 + else: + instance_names[node.name] = node.name + config.append("nk=%s:1:%s" % (node.name, instance_names[node.name])) + # assign SLRs + config.append("slr=%s:SLR0" % instance_names[node.name]) + # assign memory banks + if producer is None or consumer is None: + config.append( + "sp=%s.m_axi_gmem0:DDR[%d]" % (instance_names[node.name], 0) + ) + # connect streams + if producer is not None: + for i in range(len(node.input)): + producer = model.find_producer(node.input[i]) + if producer is not None: + j = list(producer.output).index(node.input[i]) + config.append( + "stream_connect=%s.m_axis_%d:%s.s_axis_%d" + % ( + instance_names[producer.name], + j, + instance_names[node.name], + i, + ) + ) + + # create a temporary folder for the project + link_dir = make_build_dir(prefix="vitis_link_proj_") + model.set_metadata_prop("vitis_link_proj", link_dir) + + config = "\n".join(config) + "\n" + with open(link_dir + "/config.txt", "w") as f: + f.write(config) + + # create a shell script and call Vitis + script = link_dir + "/run_vitis_link.sh" + working_dir = os.environ["PWD"] + with open(script, "w") as f: + f.write("#!/bin/bash \n") + f.write("cd {}\n".format(link_dir)) + f.write( + "v++ -t hw --platform %s --link %s" + " --kernel_frequency %d --config config.txt --optimize 2" + " --save-temps -R2\n" + % (self.platform, " ".join(object_files), self.f_mhz) + ) + f.write("cd {}\n".format(working_dir)) + bash_command = ["bash", script] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate() + # TODO rename xclbin appropriately here? 
+ xclbin = link_dir + "/a.xclbin" + assert os.path.isfile(xclbin), ( + "Vitis .xclbin file not created, check logs under %s" % link_dir + ) + model.set_metadata_prop("vitis_xclbin", xclbin) + return (model, False) + + +class VitisBuild(Transformation): + """Best-effort attempt at building the accelerator with Vitis.""" + + def __init__(self, fpga_part, period_ns, platform): + super().__init__() + self.fpga_part = fpga_part + self.period_ns = period_ns + self.platform = platform + + def apply(self, model): + _check_vitis_envvars() + # first infer layouts + model = model.transform(InferDataLayouts()) + # prepare at global level, then break up into kernels + prep_transforms = [ + MakePYNQDriver(platform="alveo"), + InsertIODMA(512), + InsertDWC(), + Floorplan(), + CreateDataflowPartition(), + ] + for trn in prep_transforms: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + # Build each kernel individually + sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition") + for sdp_node in sdp_nodes: + sdp_node = getCustomOp(sdp_node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + kernel_model = kernel_model.transform(InsertFIFO()) + kernel_model = kernel_model.transform( + InsertTLastMarker(both=True, external=False, dynamic=False) + ) + kernel_model = kernel_model.transform(GiveUniqueNodeNames()) + kernel_model.save(dataflow_model_filename) + kernel_model = kernel_model.transform( + PrepareIP(self.fpga_part, self.period_ns) + ) + kernel_model = kernel_model.transform(HLSSynthIP()) + kernel_model = kernel_model.transform(ReplaceVerilogRelPaths()) + kernel_model = kernel_model.transform( + CreateStitchedIP( + self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True + ) + ) + kernel_model = kernel_model.transform( + CreateVitisXO(sdp_node.onnx_node.name) + ) + kernel_model.save(dataflow_model_filename) + # Assemble design from kernels + model = model.transform(VitisLink(self.platform, round(1000 / self.period_ns))) + # set platform attribute for correct remote execution + model.set_metadata_prop("platform", "alveo") + + return (model, False) diff --git a/src/finn/transformation/general.py b/src/finn/transformation/general.py index 4303eb17f39a9949f5729e895e449bbb6a633033..8ad59d2baf3015cfebffeff88a059f48d9428371 100644 --- a/src/finn/transformation/general.py +++ b/src/finn/transformation/general.py @@ -81,14 +81,19 @@ class RemoveStaticGraphInputs(Transformation): class GiveUniqueNodeNames(Transformation): - """Give unique names to each node in the graph using enumeration.""" + """Give unique names to each node in the graph using enumeration, starting + with given prefix (if specified in the constructor).""" + + def __init__(self, prefix=""): + super().__init__() + self.prefix = prefix def apply(self, model): optype_count = {} for n in model.graph.node: if n.op_type not in optype_count.keys(): optype_count[n.op_type] = 0 - n.name = "%s_%d" % (n.op_type, optype_count[n.op_type]) + n.name = "%s%s_%d" % (self.prefix, n.op_type, optype_count[n.op_type]) optype_count[n.op_type] += 1 # return model_was_changed = False as single iteration is always enough return (model, False) @@ -189,6 +194,9 @@ class SortGraph(Transformation): # Probably this is faster than copying initializers and more robust in general def apply(self, model): + if len(model.graph.node) == 1: + # single-node graph, nothing to sort + return (model, False) # Gather graph 
structure graph_dependencies = {} node_list = [ @@ -214,7 +222,7 @@ class SortGraph(Transformation): for new_idx, sorted_idx in enumerate(sorted_node_indexes): model.graph.node.insert(new_idx, node_list[sorted_idx]) - return model, False + return (model, False) class ConvertSubToAdd(Transformation): diff --git a/src/finn/transformation/lower_convs_to_matmul.py b/src/finn/transformation/lower_convs_to_matmul.py index aa231a43a3865a161a501b4997ff2f538800554f..e5a1f778d0cac48925ecd97ae8b970f7bdab9c4f 100644 --- a/src/finn/transformation/lower_convs_to_matmul.py +++ b/src/finn/transformation/lower_convs_to_matmul.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np from onnx import TensorProto from onnx import helper @@ -54,12 +55,34 @@ class LowerConvsToMatMul(Transformation): k = get_by_name(n.attribute, "kernel_shape").ints[-1] pad = get_by_name(n.attribute, "pads").ints[-1] stride = get_by_name(n.attribute, "strides").ints[-1] + group = get_by_name(n.attribute, "group").i weight_name = n.input[1] W_conv = model.get_initializer(weight_name) - ifm_ch = W_conv.shape[1] - ofm_ch = W_conv.shape[0] + ifm_ch = model.get_tensor_shape(n.input[0])[1] # assume NCHW + ofm_ch = model.get_tensor_shape(n.output[0])[1] # assume NCHW ifm_dim = model.get_tensor_shape(n.input[0])[-1] # assume NCHW ofm_dim = model.get_tensor_shape(n.output[0])[-1] # assume NCHW + + # if depthwise conv create sparse matrix and variable "dw" + # to store as attribute in Im2Col that indicates that the created + # Im2Col node belongs to a depthwise convolution + dw = False + if group == ifm_ch and ofm_ch == ifm_ch: + W_sparse = np.zeros((ofm_ch, ifm_ch, k, k)) + for ch in range(ifm_ch): + W_sparse[ch][ch] = W_conv[ch][0] + W_conv = W_sparse.astype(np.float32) + # we need to store information of the + # sparsity of the weight matrix. 
For this + # we use the sparsity annotation of the + # weight tensor + sparsity = {"dw": {"kernel_shape": k}} + model.set_tensor_sparsity(weight_name, sparsity) + # additionally create variable "dw" to store + # as attribute in Im2Col that indicates that the created + # Im2Col node belongs to a depthwise convolution + dw = True + # reuse conv weights for new matmul weights # conv weights are [OFM][IFM][k][k] # first convert to [OFM][k][k][IFM] (to remain compatible with @@ -70,6 +93,7 @@ class LowerConvsToMatMul(Transformation): # transpose to get ONNX-compatible [k*k*IFM][OFM] matrix W_matmul = W_matmul.T model.set_initializer(weight_name, W_matmul) + # create new intermediate values inp_trans_out = helper.make_tensor_value_info( model.make_new_valueinfo_name(), @@ -121,6 +145,7 @@ class LowerConvsToMatMul(Transformation): kernel_size=k, pad_amount=pad, input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch), + depthwise=dw, ) # do matmul diff --git a/src/finn/transformation/merge_onnx_models.py b/src/finn/transformation/merge_onnx_models.py index 5dc6127ed189311c72a119932394aca4745e3608..ceacab197150fe6d32e3a9eda268aed186b1a8bc 100644 --- a/src/finn/transformation/merge_onnx_models.py +++ b/src/finn/transformation/merge_onnx_models.py @@ -31,12 +31,12 @@ from onnx import helper from finn.transformation import Transformation from finn.core.modelwrapper import ModelWrapper -import finn.util.basic as util from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes from finn.transformation.infer_data_layouts import InferDataLayouts from finn.transformation.general import ( GiveReadableTensorNames, + GiveRandomTensorNames, GiveUniqueNodeNames, GiveUniqueParameterTensors, ) @@ -59,6 +59,9 @@ class MergeONNXModels(Transformation): graph_modified = False pre_model = self.pre_model post_model = copy.deepcopy(model) + # to avoid mix-ups, start by giving all tensors random names + pre_model = pre_model.transform(GiveRandomTensorNames()) + post_model = post_model.transform(GiveRandomTensorNames()) # check for dynamic outputs of pre model dyn_outp = [] @@ -94,27 +97,6 @@ class MergeONNXModels(Transformation): for n in post_model.graph.node: n.name = "" - # randomize all tensor names - names1 = pre_model.get_all_tensor_names() - names2 = post_model.get_all_tensor_names() - used_names = names1 + names2 - - # pre_model - for tensor_name in names1: - new_name = util.random_string() - while new_name in used_names: - new_name = util.random_string() - pre_model.rename_tensor(tensor_name, new_name) - used_names.append(new_name) - - # post_model - for tensor in names2: - new_name = util.random_string() - while new_name in used_names: - new_name = util.random_string() - post_model.rename_tensor(tensor_name, new_name) - used_names.append(new_name) - # check if models can be merged output_model_a = dyn_outp[0].name input_model_b = dyn_inp[0].name @@ -124,6 +106,9 @@ class MergeONNXModels(Transformation): output_a_shape == input_b_shape ), "Models can't be merged! Shapes don't match." 
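+        # snapshot the renamed models (written to the current working
+        # directory) so they can be inspected if the merge goes wrong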
+ pre_model.save("pre.onnx") + post_model.save("post.onnx") + # connect output of one model to input of the other for n in pre_model.graph.node: if output_model_a == n.output[0]: @@ -132,83 +117,43 @@ class MergeONNXModels(Transformation): # extract information for new model # nodes - node_list_a = pre_model.graph.node - node_list_b = post_model.graph.node - - node_list = node_list_a - for node in node_list_b: - node_list.append(node) + node_pre = [node for node in pre_model.graph.node] + node_post = [node for node in post_model.graph.node] + node_new = node_pre + node_post # in and output inp = pre_model.graph.input[0] outp = post_model.graph.output[0] + vi_pre = [x for x in pre_model.graph.value_info] + out_pre = [x for x in pre_model.graph.output] + qa_pre = [x for x in pre_model.graph.quantization_annotation] + init_pre = [x for x in pre_model.graph.initializer] + + vi_post = [x for x in post_model.graph.value_info] + qa_post = [x for x in post_model.graph.quantization_annotation] + init_post = [x for x in post_model.graph.initializer] + + vi_new = vi_pre + vi_post + out_pre + qa_new = qa_pre + qa_post + init_new = init_pre + init_post + # create new graph and model new_graph = helper.make_graph( - nodes=node_list, + nodes=node_new, name="fuse-graph", inputs=[inp], outputs=[outp], - value_info=[], + value_info=vi_new, ) new_model = helper.make_model(new_graph, producer_name="fuse_model") new_model = ModelWrapper(new_model) - # add value info from both models to new model - # pre model - vi_pre = [x for x in pre_model.graph.input] - vi_pre += [x for x in pre_model.graph.output] - vi_pre += [x for x in pre_model.graph.value_info] - for vi in vi_pre: - # preserve intializers, quantization/sparsity annotation, etc. - # initializer - init_val = pre_model.get_initializer(vi.name) - if init_val is not None: - new_model.set_initializer(vi.name, init_val) - # FINN datatype - dtype = pre_model.get_tensor_datatype(vi.name) - new_model.set_tensor_datatype(vi.name, dtype) - # data layout - data_layout = pre_model.get_tensor_layout(vi.name) - if data_layout is not None: - new_model.set_tensor_layout(vi.name, data_layout) - # sparsity - sparsity = pre_model.get_tensor_sparsity(vi.name) - if sparsity is not None: - new_model.set_tensor_sparsity(vi.name, sparsity) - # graph input should not be part of graph.value_info, so don't insert - # if current vi == inp, but the quantization annotation is preserved - if vi == inp: - continue - new_model.graph.value_info.append(vi) - - # post model - vi_model = [x for x in post_model.graph.input] - vi_model += [x for x in post_model.graph.output] - vi_model += [x for x in post_model.graph.value_info] - for vi in vi_model: - # preserve intializers, quantization/sparsity annotation, etc. 
- # initializer - init_val = post_model.get_initializer(vi.name) - if init_val is not None: - new_model.set_initializer(vi.name, init_val) - # FINN datatype - dtype = post_model.get_tensor_datatype(vi.name) - new_model.set_tensor_datatype(vi.name, dtype) - # data layout - data_layout = post_model.get_tensor_layout(vi.name) - if data_layout is not None: - new_model.set_tensor_layout(vi.name, data_layout) - # sparsity - sparsity = post_model.get_tensor_sparsity(vi.name) - if sparsity is not None: - new_model.set_tensor_sparsity(vi.name, sparsity) - # graph output should not be part of graph.value_info, so don't insert - # if current vi == outp, but the quantization annotation is preserved - if vi == outp: - continue - new_model.graph.value_info.append(vi) + for i in init_new: + new_model.graph.initializer.append(i) + for qa in qa_new: + new_model.graph.quantization_annotation.append(qa) # tidy-up new model model = new_model diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py index 2ddaf4f840f449d3f5ec5cb83eaf461d624eb7a2..9943d371dad79a977b61810bcddafdcba505d6cc 100644 --- a/src/finn/transformation/move_reshape.py +++ b/src/finn/transformation/move_reshape.py @@ -36,5 +36,15 @@ class RemoveCNVtoFCFlatten(Transformation): graph_modified = True consumer.input[0] = n.input[0] graph.node.remove(n) + elif producer.op_type == "Transpose": + transp_node = producer + producer = model.find_producer(transp_node.input[0]) + if _is_fpgadataflow_node(producer) is True: + consumer = model.find_consumer(n.output[0]) + if _is_fpgadataflow_node(consumer) is True: + graph_modified = True + consumer.input[0] = transp_node.input[0] + graph.node.remove(n) + graph.node.remove(transp_node) return (model, graph_modified) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index c33281d85449c173a4631297fd1d67ac0aed8c81..8626ef40619b067c6672c9017ddcb747998c3f2c 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -51,10 +51,20 @@ class RoundAndClipThresholds(Transformation): model.set_tensor_datatype(n.input[1], idtype) graph_modified = True if idtype.is_integer() and not idtype.signed() and (Tnew < 0).any(): - # clip any negative thresholds + # clip any negative thresholds if input is unsigned Tnew = np.clip(Tnew, 0, None) model.set_initializer(n.input[1], Tnew) # use same datatype as inputs for thresholds model.set_tensor_datatype(n.input[1], idtype) graph_modified = True + if idtype.is_integer() and ( + (Tnew < (idtype.min() - 1)).any() + or (Tnew > (idtype.max() + 1)).any() + ): + # clip any large thresholds to input range + 1 + Tnew = np.clip(Tnew, idtype.min() - 1, idtype.max() + 1) + model.set_initializer(n.input[1], Tnew) + # use same datatype as inputs for thresholds + model.set_tensor_datatype(n.input[1], idtype) + graph_modified = True return (model, graph_modified) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 4a8277e08d3fc21e0b20668edf2ecad947b36647..cc759bebb1b856a84e25978d442e460332092d23 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -42,6 +42,7 @@ pynq_part_map = dict() pynq_part_map["Ultra96"] = "xczu3eg-sbva484-1-e" pynq_part_map["Pynq-Z1"] = "xc7z020clg400-1" pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1" +pynq_part_map["ZCU102"] = "xczu9eg-ffvb1156-2-e" pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e" # native AXI HP port width (in bits) for PYNQ boards @@ -49,8 
+50,22 @@ pynq_native_port_width = dict() pynq_native_port_width["Pynq-Z1"] = 64 pynq_native_port_width["Pynq-Z2"] = 64 pynq_native_port_width["Ultra96"] = 128 +pynq_native_port_width["ZCU102"] = 128 pynq_native_port_width["ZCU104"] = 128 +# Alveo device and platform mappings +alveo_part_map = dict() +alveo_part_map["U50"] = "xcu50-fsvh2104-2L-e" +alveo_part_map["U200"] = "xcu200-fsgd2104-2-e" +alveo_part_map["U250"] = "xcu250-figd2104-2L-e" +alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e" + +alveo_default_platform = dict() +alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_201920_3" +alveo_default_platform["U200"] = "xilinx_u200_xdma_201830_2" +alveo_default_platform["U250"] = "xilinx_u250_xdma_201830_2" +alveo_default_platform["U280"] = "xilinx_u280_xdma_201920_3" + def get_rtlsim_trace_depth(): """Return the trace depth for rtlsim via PyVerilator. Controllable @@ -141,13 +156,19 @@ def make_build_dir(prefix=""): def get_by_name(container, name, name_field="name"): - """Return item from container by .name field if it exists, None otherwise""" + """Return item from container by .name field if it exists, None otherwise. + Will throw an Exception if multiple items are found, since this violates the + ONNX standard.""" names = [getattr(x, name_field) for x in container] - try: - ind = names.index(name) - return container[ind] - except ValueError: + + inds = [i for i, e in enumerate(names) if e == name] + if len(inds) > 1: + raise Exception("Found multiple get_by_name matches, undefined behavior") + elif len(inds) == 0: return None + else: + ind = inds[0] + return container[ind] def remove_by_name(container, name, name_field="name"): @@ -244,6 +265,33 @@ def pad_tensor_to_multiple_of(ndarray, pad_to_dims, val=0, distr_pad=False): return ret +def calculate_matvec_accumulator_range(matrix, vec_dt): + """Calculate the minimum and maximum possible result (accumulator) values + for a dot product x * A, given matrix A of dims (MW, MH), and vector (1, MW) + with datatype vec_dt. Returns (acc_min, acc_max). + """ + min_weight = matrix.min() + max_weight = matrix.max() + perceptive_field_elems = matrix.shape[0] + min_input = vec_dt.min() + max_input = vec_dt.max() + # calculate minimum and maximum values of accumulator + # assume inputs span the whole range of the input datatype + acc_min = perceptive_field_elems * min( + min_weight * max_input, + min_weight * min_input, + max_weight * max_input, + max_weight * min_input, + ) + acc_max = perceptive_field_elems * max( + min_weight * max_input, + min_weight * min_input, + max_weight * max_input, + max_weight * min_input, + ) + return (acc_min, acc_max) + + def gen_finn_dt_tensor(finn_dt, tensor_shape): """Generates random tensor in given shape and with given FINN DataType.""" if type(tensor_shape) == list: diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py index d9e244422065314ceb790dc6719b57688ff76828..a4400f7bd7e75549189f081ce255fd67c49b3746 100644 --- a/src/finn/util/vcd.py +++ b/src/finn/util/vcd.py @@ -162,16 +162,23 @@ def _get_stats(x): return (x[0], get_stream_if_stats(x[1], x[0])) -def get_all_stream_if_stats(vcd_file, stream_ifs=None, sort_by="{'V': 1, 'R': 0}"): +def get_all_stream_if_stats(vcd_file, stream_ifs=None, sort_by="{'V': 1, 'R': 0}", num_workers=None): """Return a list of streaming interface stats, sorted by the percentage - for the given sort_by key. If stream_ifs is None, all streamin interface + for the given sort_by key. 
If stream_ifs is None, all streaming interface stats will be returned, otherwise treated as a list of interface names to - return the stats for.""" + return the stats for. + By default the number of parallel workers from the environment variable + NUM_DEFAULT_WORKERS will be used. This behavior can be changed on a per + call basis by supplying the optional parameter: num_workers + """ if stream_ifs is None: stream_ifs = list_stream_if(vcd_file) - with mp.Pool(get_num_default_workers()) as p: + if num_workers is None: + num_workers = get_num_default_workers() + + with mp.Pool(num_workers) as p: stream_ifs = map(lambda x: (x, vcd_file), stream_ifs) all_stats = p.map(_get_stats, stream_ifs) diff --git a/tests/brevitas/test_brevitas_QConv2d.py b/tests/brevitas/test_brevitas_QConv2d.py new file mode 100644 index 0000000000000000000000000000000000000000..198f1e7961a9e160589989b8b34b45b5fda53817 --- /dev/null +++ b/tests/brevitas/test_brevitas_QConv2d.py @@ -0,0 +1,76 @@ +import pytest +import os +import numpy as np +import torch +import brevitas.onnx as bo +from brevitas.nn import QuantConv2d +from brevitas.core.restrict_val import RestrictValueType +from brevitas.core.quant import QuantType +from brevitas.core.scaling import ScalingImplType +from brevitas.core.stats import StatsOp + +from finn.core.modelwrapper import ModelWrapper +from finn.core.datatype import DataType +import finn.core.onnx_exec as oxe +from finn.transformation.infer_shapes import InferShapes +from finn.util.basic import gen_finn_dt_tensor + +export_onnx_path = "test_brevitas_conv.onnx" + + +@pytest.mark.parametrize("dw", [False, True]) +@pytest.mark.parametrize("in_channels", [32]) +def test_brevitas_QConv2d(dw, in_channels): + ishape = (1, 32, 111, 111) + if dw is True: + groups = in_channels + out_channels = in_channels + kernel_size = 3 + padding = 1 + stride = 1 + w_shape = (32, 1, 3, 3) + + else: + groups = 1 + out_channels = 64 + kernel_size = 1 + padding = 0 + stride = 1 + w_shape = (64, 32, 1, 1) + + b_conv = QuantConv2d( + in_channels=in_channels, + out_channels=out_channels, + groups=groups, + kernel_size=kernel_size, + padding=padding, + stride=stride, + bias=False, + bias_quant_type=QuantType.FP, + compute_output_bit_width=False, + compute_output_scale=False, + weight_bit_width=4, + weight_quant_type=QuantType.INT, + weight_scaling_impl_type=ScalingImplType.STATS, + weight_scaling_stats_op=StatsOp.MAX, + weight_scaling_per_output_channel=True, + weight_restrict_scaling_type=RestrictValueType.LOG_FP, + weight_narrow_range=True, + weight_scaling_min_val=2e-16, + ) + weight_tensor = gen_finn_dt_tensor(DataType.INT4, w_shape) + b_conv.weight = torch.nn.Parameter(torch.from_numpy(weight_tensor).float()) + + bo.export_finn_onnx(b_conv, ishape, export_onnx_path) + model = ModelWrapper(export_onnx_path) + model = model.transform(InferShapes()) + inp_tensor = np.random.uniform(low=-1.0, high=1.0, size=ishape).astype(np.float32) + idict = {model.graph.input[0].name: inp_tensor} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + inp_tensor = torch.from_numpy(inp_tensor).float() + b_conv.eval() + expected = b_conv.forward(inp_tensor).detach().numpy() + + assert np.isclose(produced, expected, atol=1e-3).all() + os.remove(export_onnx_path) diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py index a2cfcd3a864c12788c2ac73271b5782ddfa336c1..f931f91c89f738899ff9e6584be81a3b2d542227 100644 --- a/tests/end2end/test_end2end_cnv_w1a1.py +++ 
b/tests/end2end/test_end2end_cnv_w1a1.py @@ -78,6 +78,7 @@ from finn.transformation.fpgadataflow.annotate_resources import AnnotateResource from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.core.throughput_test import throughput_test_rtlsim +import warnings build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -317,12 +318,16 @@ def test_end2end_cnv_w1a1_synth_pynq_project(): ) model = model.transform(SynthPYNQProject()) model = model.transform(AnnotateResources("synth")) + warnings.warn( + "Post-synthesis resources (excluding shell): " + + model.get_metadata_prop("res_total_synth") + ) model.save(build_dir + "/end2end_cnv_w1a1_synth.onnx") def test_end2end_cnv_w1a1_make_driver(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_synth.onnx") - model = model.transform(MakePYNQDriver()) + model = model.transform(MakePYNQDriver(platform="zynq")) model.save(build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx") diff --git a/tests/end2end/test_end2end_cnv_w2a2.py b/tests/end2end/test_end2end_cnv_w2a2.py index f45b0a3eccd2f52ea144405865a1df06315952d9..239094a3c931c16b3afe8d1874345e4dc90334ef 100644 --- a/tests/end2end/test_end2end_cnv_w2a2.py +++ b/tests/end2end/test_end2end_cnv_w2a2.py @@ -77,6 +77,7 @@ from finn.transformation.fpgadataflow.annotate_resources import AnnotateResource from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.core.throughput_test import throughput_test_rtlsim +import warnings build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -315,12 +316,16 @@ def test_end2end_cnv_w2a2_synth_pynq_project(): ) model = model.transform(SynthPYNQProject()) model = model.transform(AnnotateResources("synth")) + warnings.warn( + "Post-synthesis resources (excluding shell): " + + model.get_metadata_prop("res_total_synth") + ) model.save(build_dir + "/end2end_cnv_w2a2_synth.onnx") def test_end2end_cnv_w2a2_make_driver(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_synth.onnx") - model = model.transform(MakePYNQDriver()) + model = model.transform(MakePYNQDriver(platform="zynq")) model.save(build_dir + "/end2end_cnv_w2a2_pynq_driver.onnx") diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1.py index 31659df631e8ab489cb63dbef51200f313bca6b3..1a3cc4f1bb9232809e864bb0c784498534f63631 100644 --- a/tests/end2end/test_end2end_tfc_w1a1.py +++ b/tests/end2end/test_end2end_tfc_w1a1.py @@ -79,6 +79,7 @@ from finn.transformation.fpgadataflow.annotate_resources import AnnotateResource from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.core.throughput_test import throughput_test_rtlsim import finn.util.vcd as vcd +import warnings build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -241,11 +242,11 @@ def test_end2end_tfc_w1a1_throughput_test_rtlsim(): # run through IP-stitched rtlsim with increasing batch sizes and # check the number of cycles it takes to execute ret = throughput_test_rtlsim(model, 1) - assert ret["cycles"] == 205 + assert np.isclose(ret["cycles"], 205, atol=5) ret = throughput_test_rtlsim(model, 10) - assert ret["cycles"] == 844 + assert np.isclose(ret["cycles"], 844, atol=10) ret = 
throughput_test_rtlsim(model, 100) - assert ret["cycles"] == 7234 + assert np.isclose(ret["cycles"], 7234, atol=100) @pytest.mark.vivado @@ -314,12 +315,16 @@ def test_end2end_tfc_w1a1_synth_pynq_project(): ) model = model.transform(SynthPYNQProject()) model = model.transform(AnnotateResources("synth")) + warnings.warn( + "Post-synthesis resources (excluding shell): " + + model.get_metadata_prop("res_total_synth") + ) model.save(build_dir + "/end2end_tfc_w1a1_synth.onnx") def test_end2end_tfc_w1a1_make_driver(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_synth.onnx") - model = model.transform(MakePYNQDriver()) + model = model.transform(MakePYNQDriver(platform="zynq")) model.save(build_dir + "/end2end_tfc_w1a1_pynq_driver.onnx") diff --git a/tests/end2end/test_end2end_tfc_w1a2.py b/tests/end2end/test_end2end_tfc_w1a2.py index d5579f625a20ae26e18bcdcba0cfaa3042a71b9a..0f066cb06c53ce118d0a357fce0999299d7f3305 100644 --- a/tests/end2end/test_end2end_tfc_w1a2.py +++ b/tests/end2end/test_end2end_tfc_w1a2.py @@ -74,6 +74,7 @@ from finn.util.basic import pynq_part_map from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +import warnings build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -270,12 +271,16 @@ def test_end2end_tfc_w1a2_synth_pynq_project(): ) model = model.transform(SynthPYNQProject()) model = model.transform(AnnotateResources("synth")) + warnings.warn( + "Post-synthesis resources (excluding shell): " + + model.get_metadata_prop("res_total_synth") + ) model.save(build_dir + "/end2end_tfc_w1a2_synth.onnx") def test_end2end_tfc_w1a2_make_driver(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_synth.onnx") - model = model.transform(MakePYNQDriver()) + model = model.transform(MakePYNQDriver(platform="zynq")) model.save(build_dir + "/end2end_tfc_w1a2_pynq_driver.onnx") diff --git a/tests/end2end/test_end2end_tfc_w2a2.py b/tests/end2end/test_end2end_tfc_w2a2.py index 470119f3444987f0156caff81bf556bf4f2f2cbb..6eb613fc877b6e6801140f2a03c3a9509c08c0cb 100644 --- a/tests/end2end/test_end2end_tfc_w2a2.py +++ b/tests/end2end/test_end2end_tfc_w2a2.py @@ -74,6 +74,7 @@ from finn.util.basic import pynq_part_map from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +import warnings build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -270,12 +271,16 @@ def test_end2end_tfc_w2a2_synth_pynq_project(): ) model = model.transform(SynthPYNQProject()) model = model.transform(AnnotateResources("synth")) + warnings.warn( + "Post-synthesis resources (excluding shell): " + + model.get_metadata_prop("res_total_synth") + ) model.save(build_dir + "/end2end_tfc_w2a2_synth.onnx") def test_end2end_tfc_w2a2_make_driver(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_synth.onnx") - model = model.transform(MakePYNQDriver()) + model = model.transform(MakePYNQDriver(platform="zynq")) model.save(build_dir + "/end2end_tfc_w2a2_pynq_driver.onnx") diff --git a/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py b/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py new file mode 
100644 index 0000000000000000000000000000000000000000..a272fadc12f095034693e555e4d791e9e73262ab --- /dev/null +++ b/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py @@ -0,0 +1,251 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import pytest +import numpy as np + +# as of Feb'20 there is a bug that segfaults ONNX shape inference if we +# import pytorch before onnx, so we make sure to import onnx first +import onnx # NOQA +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.streamline.absorb as absorb +from finn.core.onnx_exec import execute_onnx +from finn.custom_op.registry import getCustomOp +from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount +from finn.transformation.fold_constants import FoldConstants + +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ +from finn.transformation.general import ( + RemoveUnusedTensors, + RemoveStaticGraphInputs, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.streamline import Streamline +from finn.util.basic import pynq_part_map +from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip +from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources +from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild +import pkg_resources as pk +from finn.transformation.double_to_single_float import DoubleToSingleFloat +from finn.transformation.move_reshape import RemoveCNVtoFCFlatten +from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles +import warnings + + +build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] 
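+# the target board (and thus the FPGA part) comes from the PYNQ_BOARD
+# environment variable, so this flow can be retargeted, e.g. by exporting
+# PYNQ_BOARD=ZCU104 before running these tests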
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") +test_fpga_part = pynq_part_map[test_pynq_board] +target_clk_ns = 10 +mem_mode = "decoupled" + + +def test_end2end_zynqbuild_cnv_w1a1_export(): + import brevitas.onnx as bo + + tfc = get_test_model_trained("CNV", 1, 1) + bo.export_finn_onnx( + tfc, (1, 3, 32, 32), build_dir + "/end2end_zynqbuild_cnv_w1a1_export.onnx" + ) + + +def test_end2end_zynqbuild_cnv_w1a1_import_and_tidy(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_export.onnx" + ) + model = model.transform(DoubleToSingleFloat()) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(RemoveStaticGraphInputs()) + model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_tidy.onnx") + + +def test_end2end_zynqbuild_cnv_w1a1_streamline(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_tidy.onnx" + ) + model = model.transform(Streamline()) + model = model.transform(LowerConvsToMatMul()) + model = model.transform(MakeMaxPoolNHWC()) + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(ConvertBipolarMatMulToXnorPopcount()) + model = model.transform(Streamline()) + model = model.transform(RemoveUnusedTensors()) + model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_streamlined.onnx") + + +def test_end2end_zynqbuild_cnv_w1a1_convert_to_hls_layers(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_streamlined.onnx" + ) + model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode)) + model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode)) + model = model.transform(to_hls.InferConvInpGen()) + model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(RemoveCNVtoFCFlatten()) + model = model.transform(InferDataLayouts()) + model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_hls_layers.onnx") + + +def test_end2end_zynqbuild_cnv_w1a1_create_dataflow_partition(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_hls_layers.onnx" + ) + parent_model = model.transform(CreateDataflowPartition()) + parent_model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_dataflow_parent.onnx") + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) + dataflow_model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_dataflow_model.onnx") + + +def test_end2end_zynqbuild_cnv_w1a1_fold(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_dataflow_model.onnx" + ) + fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") + # each tuple is (PE, SIMD, in_fifo_depth) for a layer + folding = [ + (16, 3, 256), + (32, 32, 256), + (16, 32, 256), + (16, 32, 256), + (4, 32, 214), + (1, 32, 2), + (1, 4, 126), + (1, 8, 62), + (5, 1, 6), + ] + for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding): + fcl_inst = getCustomOp(fcl) + fcl_inst.set_nodeattr("PE", pe) + fcl_inst.set_nodeattr("SIMD", simd) + fcl_inst.set_nodeattr("inFIFODepth", ififodepth) + + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_idepth = [2, 51, 9, 106, 2, 2] + for i in 
range(len(swg_layers)): + swg_inst = getCustomOp(swg_layers[i]) + simd = folding[i][1] + swg_inst.set_nodeattr("SIMD", simd) + swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i]) + model = model.transform(AnnotateResources("estimate")) + model = model.transform(AnnotateCycles()) + model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_folded.onnx") + + +@pytest.mark.slow +@pytest.mark.vivado +def test_end2end_zynqbuild_cnv_w1a1_build(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_folded.onnx" + ) + model = model.transform(ZynqBuild(test_pynq_board, target_clk_ns)) + model = model.transform(AnnotateResources("synth")) + warnings.warn( + "Post-synthesis resources (excluding shell): " + + model.get_metadata_prop("res_total_synth") + ) + model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_build.onnx") + + +def test_end2end_zynqbuild_cnv_w1a1_deploy_on_pynq(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_build.onnx" + ) + try: + ip = os.environ["PYNQ_IP"] # no fault for this one; skip if not defined + if ip == "": + pytest.skip("PYNQ board IP address not specified") + username = os.getenv("PYNQ_USERNAME", "xilinx") + password = os.getenv("PYNQ_PASSWORD", "xilinx") + port = os.getenv("PYNQ_PORT", 22) + target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") + model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir)) + # save the model to be able to link it to the parent + model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_pynq_deploy.onnx") + except KeyError: + pytest.skip("PYNQ board IP address not specified") + + +def test_end2end_zynqbuild_cnv_w1a1_run_on_pynq(): + # use the streamlined model as the "golden" model for right answers + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_streamlined.onnx" + ) + iname = golden.graph.input[0].name + oname = golden.graph.output[0].name + # load one of the test vectors + fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz") + input_tensor = np.load(fn)["arr_0"].astype(np.float32) + input_tensor = input_tensor / 255 + assert input_tensor.shape == (1, 3, 32, 32) + x = input_tensor + # x = np.zeros(ishape, dtype=np.float32) + # run using FINN-based execution + ret_golden = execute_onnx(golden, {iname: x}, True) + y_golden = ret_golden[oname] + # set up parent+child graph to test + # we'll use models from the previous step as the child model + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_dataflow_parent.onnx" + ) + iname = parent_model.graph.input[0].name + oname = parent_model.graph.output[0].name + try: + ip = os.environ["PYNQ_IP"] # NOQA + if ip == "": + pytest.skip("PYNQ board IP address not specified") + # produce results with cppsim + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_cnv_w1a1_pynq_deploy.onnx" + ) + sdp_node.set_nodeattr( + "model", build_dir + "/end2end_zynqbuild_cnv_w1a1_pynq_deploy.onnx" + ) + ret = execute_onnx(parent_model, {iname: x}, True) + y = ret[oname] + assert np.isclose(y, y_golden).all() + assert np.argmax(y) == 3 + + except KeyError: + pytest.skip("PYNQ board IP address not specified") diff --git a/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py b/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py new file mode 100644 index 
0000000000000000000000000000000000000000..8b298d5644d6d6cda038e8ca1757be7538ba9804 --- /dev/null +++ b/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py @@ -0,0 +1,229 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +from pkgutil import get_data + +import pytest + +import numpy as np + +# as of Feb'20 there is a bug that segfaults ONNX shape inference if we +# import pytorch before onnx, so we make sure to import onnx first +import onnx # NOQA +import onnx.numpy_helper as nph + +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.streamline.absorb as absorb +from finn.core.onnx_exec import execute_onnx +from finn.custom_op.registry import getCustomOp +from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount +from finn.transformation.fold_constants import FoldConstants + +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ +from finn.transformation.general import ( + RemoveUnusedTensors, + RemoveStaticGraphInputs, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.streamline import Streamline +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds +from finn.util.basic import pynq_part_map +from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip +from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild +import warnings + +build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] +test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") +test_fpga_part = pynq_part_map[test_pynq_board] +target_clk_ns = 10 +mem_mode = "decoupled" + + +def 
test_end2end_zynqbuild_tfc_w1a1_export(): + import brevitas.onnx as bo + + tfc = get_test_model_trained("TFC", 1, 1) + bo.export_finn_onnx( + tfc, (1, 1, 28, 28), build_dir + "/end2end_zynqbuild_tfc_w1a1_export.onnx" + ) + + +def test_end2end_zynqbuild_tfc_w1a1_import_and_tidy(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_tfc_w1a1_export.onnx" + ) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(RemoveStaticGraphInputs()) + model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_tidy.onnx") + + +def test_end2end_zynqbuild_tfc_w1a1_streamline(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_tfc_w1a1_tidy.onnx" + ) + model = model.transform(Streamline()) + model = model.transform(RemoveUnusedTensors()) + model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_streamlined.onnx") + + +def test_end2end_zynqbuild_tfc_w1a1_convert_to_hls_layers(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_tfc_w1a1_streamlined.onnx" + ) + model = model.transform(ConvertBipolarMatMulToXnorPopcount()) + model = model.transform(absorb.AbsorbAddIntoMultiThreshold()) + model = model.transform(absorb.AbsorbMulIntoMultiThreshold()) + model = model.transform(RoundAndClipThresholds()) + model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode)) + model = model.transform(InferDataLayouts()) + model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_hls_layers.onnx") + + +def test_end2end_zynqbuild_tfc_w1a1_create_dataflow_partition(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_tfc_w1a1_hls_layers.onnx" + ) + parent_model = model.transform(CreateDataflowPartition()) + parent_model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_dataflow_parent.onnx") + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) + dataflow_model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_dataflow_model.onnx") + + +def test_end2end_zynqbuild_tfc_w1a1_fold(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_tfc_w1a1_dataflow_model.onnx" + ) + fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") + # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer + config = [ + (16, 49, 16, 64, "block"), + (8, 8, 64, 64, "auto"), + (8, 8, 64, 64, "auto"), + (10, 8, 64, 10, "distributed"), + ] + for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config): + fcl_inst = getCustomOp(fcl) + fcl_inst.set_nodeattr("PE", pe) + fcl_inst.set_nodeattr("SIMD", simd) + fcl_inst.set_nodeattr("inFIFODepth", ififo) + fcl_inst.set_nodeattr("outFIFODepth", ofifo) + fcl_inst.set_nodeattr("ram_style", ramstyle) + + model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_folded.onnx") + + +@pytest.mark.slow +@pytest.mark.vivado +def test_end2end_zynqbuild_tfc_w1a1_build(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_tfc_w1a1_folded.onnx" + ) + model = model.transform(ZynqBuild(test_pynq_board, target_clk_ns)) + model = model.transform(AnnotateResources("synth")) + warnings.warn( + "Post-synthesis resources (excluding shell): " + + model.get_metadata_prop("res_total_synth") + ) + 
model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_build.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w1a1_deploy_on_pynq():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_build.onnx"
+    )
+    try:
+        ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        # save the model to be able to link it to the parent
+        model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_pynq_deploy.onnx")
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
+
+
+def test_end2end_zynqbuild_tfc_w1a1_run_on_pynq():
+    # use the streamlined model as the "golden" model for right answers
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_streamlined.onnx"
+    )
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
+    input_tensor = onnx.load_tensor_from_string(raw_i)
+    x = nph.to_array(input_tensor)
+    # x = np.zeros(ishape, dtype=np.float32)
+    # run using FINN-based execution
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up parent+child graph to test
+    # we'll use models from the previous step as the child model
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_dataflow_parent.onnx"
+    )
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        # produce results with remote execution on the PYNQ board
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(
+            build_dir + "/end2end_zynqbuild_tfc_w1a1_pynq_deploy.onnx"
+        )
+        sdp_node.set_nodeattr(
+            "model", build_dir + "/end2end_zynqbuild_tfc_w1a1_pynq_deploy.onnx"
+        )
+        ret = execute_onnx(parent_model, {iname: x}, True)
+        y = ret[oname]
+        assert np.isclose(y, y_golden).all()
+
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py b/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdb24d82dd639abe52aac9688b0b98430f72cabd
--- /dev/null
+++ b/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py
@@ -0,0 +1,218 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +from pkgutil import get_data + +import pytest + +import numpy as np + +# as of Feb'20 there is a bug that segfaults ONNX shape inference if we +# import pytorch before onnx, so we make sure to import onnx first +import onnx # NOQA +import onnx.numpy_helper as nph +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.core.onnx_exec import execute_onnx +from finn.custom_op.registry import getCustomOp +from finn.transformation.fold_constants import FoldConstants +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ +from finn.transformation.general import ( + RemoveUnusedTensors, + RemoveStaticGraphInputs, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.streamline import Streamline +from finn.util.basic import pynq_part_map +from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip +from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources +from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild +import warnings + +build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] +test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") +test_fpga_part = pynq_part_map[test_pynq_board] +target_clk_ns = 10 +mem_mode = "decoupled" + + +def test_end2end_zynqbuild_tfc_w2a2_export(): + import brevitas.onnx as bo + + tfc = get_test_model_trained("TFC", 2, 2) + bo.export_finn_onnx( + tfc, (1, 1, 28, 28), build_dir + "/end2end_zynqbuild_tfc_w2a2_export.onnx" + ) + + +def test_end2end_zynqbuild_tfc_w2a2_import_and_tidy(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_tfc_w2a2_export.onnx" + ) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(RemoveStaticGraphInputs()) + model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_tidy.onnx") + + +def test_end2end_zynqbuild_tfc_w2a2_streamline(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_tfc_w2a2_tidy.onnx" + ) + model = model.transform(Streamline()) + model = model.transform(RemoveUnusedTensors()) + model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_streamlined.onnx") + + +def test_end2end_zynqbuild_tfc_w2a2_convert_to_hls_layers(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_zynqbuild_tfc_w2a2_streamlined.onnx" + ) + model = 
model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_hls_layers.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_create_dataflow_partition():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_hls_layers.onnx"
+    )
+    parent_model = model.transform(CreateDataflowPartition())
+    parent_model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_dataflow_parent.onnx")
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
+    dataflow_model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_dataflow_model.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_fold():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_dataflow_model.onnx"
+    )
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
+    config = [
+        (16, 49, 16, 64, "block"),
+        (8, 8, 64, 64, "auto"),
+        (8, 8, 64, 64, "auto"),
+        (10, 8, 64, 10, "distributed"),
+    ]
+    for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("inFIFODepth", ififo)
+        fcl_inst.set_nodeattr("outFIFODepth", ofifo)
+        fcl_inst.set_nodeattr("ram_style", ramstyle)
+
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_folded.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_zynqbuild_tfc_w2a2_build():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_folded.onnx"
+    )
+    model = model.transform(ZynqBuild(test_pynq_board, target_clk_ns))
+    model = model.transform(AnnotateResources("synth"))
+    warnings.warn(
+        "Post-synthesis resources (excluding shell): "
+        + model.get_metadata_prop("res_total_synth")
+    )
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_build.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_deploy_on_pynq():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_build.onnx"
+    )
+    try:
+        ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        # save the model to be able to link it to the parent
+        model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_pynq_deploy.onnx")
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_run_on_pynq():
+    # use the streamlined model as the "golden" model for right answers
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_streamlined.onnx"
+    )
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
+    input_tensor = onnx.load_tensor_from_string(raw_i)
+    x = nph.to_array(input_tensor)
+    # x = np.zeros(ishape, dtype=np.float32)
+    # run using FINN-based execution
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up parent+child graph to test
+    # we'll use models from the previous step as the child model
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_dataflow_parent.onnx"
+    )
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        # produce results with remote execution on the PYNQ board
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(
+            build_dir + "/end2end_zynqbuild_tfc_w2a2_pynq_deploy.onnx"
+        )
+        sdp_node.set_nodeattr(
+            "model", build_dir + "/end2end_zynqbuild_tfc_w2a2_pynq_deploy.onnx"
+        )
+        ret = execute_onnx(parent_model, {iname: x}, True)
+        y = ret[oname]
+        assert np.isclose(y, y_golden).all()
+
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index 22c356a5869b25fcc7ae3ef0164ed61b53ef232c..9be9c904b0be0a8c1ab2421590922ae6cf2e1295 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -23,6 +23,8 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.custom_op.im2col import compute_conv_output_dim
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 # conv_config kernel_size,stride, pad
@@ -30,29 +32,36 @@ from finn.custom_op.im2col import compute_conv_output_dim
 @pytest.mark.parametrize(
     "conv_config", [(1, 2, 0), (1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)]
 )
+@pytest.mark.parametrize("depthwise", [False, True])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_conv_layer(conv_config, exec_mode):
+def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     kernel_size, stride, pad = conv_config
     np.random.seed(0)
     idt = DataType.UINT4
 
     in_feature_dim = 7
     in_chn = 16
-    out_chn = 20
+
+    if depthwise is True:
+        group = out_chn = in_chn
+        conv_param_shape = [out_chn, 1, kernel_size, kernel_size]
+    else:
+        group = 1
+        out_chn = 20
+        conv_param_shape = [out_chn, in_chn, kernel_size, kernel_size]
 
     out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad)
 
     input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
     output_shape = [1, out_chn, out_feature_dim, out_feature_dim]
-    conv_param_shape = [out_chn, in_chn, kernel_size, kernel_size]
 
     conv_weight_dt = DataType.UINT4
 
     conv_config = {}
     conv_config["dilations"] = [1, 1]
-    conv_config["group"] = 1
+    conv_config["group"] = group
     conv_config["kernel_shape"] = [kernel_size, kernel_size]
     conv_config["pads"] = [pad, pad, pad, pad]
     conv_config["strides"] = [stride, stride]
 
@@ -86,6 +95,18 @@ def test_convert_to_hls_conv_layer(conv_config, exec_mode):
     new_model = model.transform(LowerConvsToMatMul())
     new_model = new_model.transform(to_hls.InferConvInpGen())
+    if depthwise is True:
+        new_model = new_model.transform(to_hls.InferVVAU())
+    else:
+        new_model = new_model.transform(to_hls.InferQuantizedStreamingFCLayer())
+        fc_node = new_model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+        fc_inst = getCustomOp(fc_node)
+        mw = 
fc_inst.get_nodeattr("MW") + mh = fc_inst.get_nodeattr("MH") + pe_cands = list(filter(lambda x: mh % x == 0, range(2, mh + 1))) + simd_cands = list(filter(lambda x: mw % x == 0, range(2, mw + 1))) + fc_inst.set_nodeattr("PE", pe_cands[0]) + fc_inst.set_nodeattr("SIMD", simd_cands[0]) new_model = new_model.transform(GiveUniqueNodeNames()) new_model = new_model.transform(InferShapes()) @@ -110,3 +131,25 @@ def test_convert_to_hls_conv_layer(conv_config, exec_mode): assert oxe.compare_execution(model, new_model, inp_dict) if kernel_size == 1 and stride > 1 and pad == 0: assert new_model.graph.node[1].op_type == "DownSampler" + if exec_mode == "rtlsim": + node = new_model.get_nodes_by_op_type("DownSampler")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=11) + assert exp_cycles != 0 + + if pad == 1: + padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + padding_inst = getCustomOp(padding_node) + assert padding_inst.get_nodeattr("SIMD") == in_chn + + if depthwise is True and exec_mode == "rtlsim": + node = new_model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=11) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py index d77065ad9396d0cc8dd57a39ed823fffcb30ee47..bd600c6c57d00d5fc03152f75b9f2f8c6beeeb2c 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py @@ -89,7 +89,6 @@ def test_convert_to_hls_layers_tfc_w1a1(): assert fc3.op_type == "StreamingFCLayer_Batch" assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] - os.remove(export_onnx_path) fc0w = getCustomOp(fc0) fc0w.set_nodeattr("SIMD", 784) @@ -123,6 +122,7 @@ def test_convert_to_hls_layers_tfc_w1a1(): # do forward pass in PyTorch/Brevitas expected = tfc.forward(input_tensor).detach().numpy() assert np.isclose(produced, expected, atol=1e-3).all() + os.remove(export_onnx_path) @pytest.mark.vivado diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py index aba973051cb14e3e428e4de72a57924884c831de..86409feffd120b1baeeee471415e93f29d9e655a 100644 --- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py @@ -44,6 +44,7 @@ from finn.transformation.general import GiveUniqueNodeNames from finn.custom_op.registry import getCustomOp from finn.util.basic import gen_finn_dt_tensor from finn.transformation.infer_shapes import InferShapes +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt): @@ -210,3 +211,11 @@ def test_convert_to_hls_pool_batch( assert len(new_model.graph.node) == 5 else: assert len(new_model.graph.node) == 1 + + if exec_mode == "rtlsim": + node = new_model.get_nodes_by_op_type("Pool_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) + 
exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py new file mode 100644 index 0000000000000000000000000000000000000000..f530926e46ac5c116c3f15688c7f2face7954a30 --- /dev/null +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -0,0 +1,249 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
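+
+# This test builds a lowered depthwise convolution (an Im2Col node feeding a
+# sparse MatMul, optionally followed by a MultiThreshold activation) as the
+# reference model, converts it to the ConvolutionInputGenerator and
+# Vector_Vector_Activate_Batch HLS layers, and checks that cppsim and rtlsim
+# executions match the reference execution.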
+ +import pytest +import onnx.helper as oh +from onnx import TensorProto +import numpy as np + +from finn.core.modelwrapper import ModelWrapper +from finn.core.datatype import DataType +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.fpgadataflow.convert_to_hls_layers import ( + InferConvInpGen, + InferVVAU, +) +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + +import finn.core.onnx_exec as oxe +from finn.custom_op.im2col import compute_conv_output_dim +from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor +from finn.custom_op.registry import getCustomOp + +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) + + +def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): + + # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold) + ofm_ch = ifm_ch + ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad=padding) + + if act is None: + odt = DataType.INT32 + else: + odt = act + out_act = oh.make_tensor_value_info( + "out_act", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ofm_ch] + ) + T = oh.make_tensor_value_info("T", TensorProto.FLOAT, [ofm_ch, 15]) + tdt = DataType.INT32 + thresh_node = oh.make_node( + "MultiThreshold", + domain="finn", + inputs=["outp", "T"], + outputs=["out_act"], + data_layout="NHWC", + out_dtype=odt.name, + out_scale=1.0, + out_bias=0.0, + ) + + # set up onnx model + inp = oh.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch] + ) + outp = oh.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ofm_ch] + ) + + W_sparse = oh.make_tensor_value_info( + "W_sparse", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch] + ) + + im2col_node = oh.make_node( + "Im2Col", + domain="finn", + inputs=["inp"], + outputs=["im2col_out"], + kernel_size=k, + stride=stride, + pad_amount=padding, + input_shape="(1, {}, {}, {})".format(ifm_dim, ifm_dim, ifm_ch), + depthwise=1, + ) + + matmul_node = oh.make_node( + "MatMul", inputs=["im2col_out", "W_sparse"], outputs=["outp"] + ) + + if act is None: + node_list = [im2col_node, matmul_node] + global_out = outp + value_info = [W_sparse] + else: + node_list = [im2col_node, matmul_node, thresh_node] + global_out = out_act + value_info = [W_sparse, T] + + graph = oh.make_graph( + nodes=node_list, + name="lowered_dw_cnv_graph", + inputs=[inp], + outputs=[global_out], + value_info=value_info, + ) + model = oh.make_model(graph, producer_name="lowered_dw_cnv-model") + model = ModelWrapper(model) + + # initialize model + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype(model.graph.output[0].name, odt) + model.set_tensor_datatype("W_sparse", wdt) + + w_tensor = gen_finn_dt_tensor(wdt, [ofm_ch, 1, k, k]) + # create sparse matrix + W_matrix = np.zeros((ofm_ch, ifm_ch, k, k)) + for ch in range(ifm_ch): + W_matrix[ch][ch] = w_tensor[ch][0] + W_matrix = W_matrix.astype(np.float32) + W_matrix = W_matrix.transpose(0, 2, 3, 1) + W_matrix = W_matrix.reshape(ofm_ch, ifm_ch * k * k) + + model.set_initializer("W_sparse", 
W_matrix.T) + sparsity = {"dw": {"kernel_shape": k}} + model.set_tensor_sparsity("W_sparse", sparsity) + + if act is not None: + (min, max) = calculate_signed_dot_prod_range(idt, wdt, ifm_ch * k * k) + n_steps = odt.get_num_possible_values() - 1 + T_values = np.random.randint(min, max - 1, (ofm_ch, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T_values = np.sort(T_values, axis=1) + model.set_initializer("T", T_values) + model.set_tensor_datatype("T", tdt) + + model = model.transform(InferShapes()) + + return model + + +# PE +@pytest.mark.parametrize("pe", [1, 2, 4]) +# Output activation +@pytest.mark.parametrize("act", [None, DataType.UINT4]) +# kernel size +@pytest.mark.parametrize("k", [2, 4]) +# stride +@pytest.mark.parametrize("stride", [1, 2]) +# padding +@pytest.mark.parametrize("padding", [0, 1]) +@pytest.mark.slow +@pytest.mark.vivado +def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): + idt = wdt = DataType.INT4 + ifm_dim = 6 + ifm_ch = 4 + + # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold) + model = set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding) + + input_tensor = gen_finn_dt_tensor(idt, [1, ifm_dim, ifm_dim, ifm_ch]) + input_dict = {"inp": input_tensor} + + new_model = model.transform(InferConvInpGen()) + new_model = new_model.transform(InferVVAU()) + + # set SIMD in ConvInputGen node and PE in VVAU node + + for n in new_model.graph.node: + if n.op_type == "ConvolutionInputGenerator": + convinputgen_node = getCustomOp(n) + convinputgen_node.set_nodeattr("SIMD", pe) + elif n.op_type == "Vector_Vector_Activate_Batch": + vvau_node = getCustomOp(n) + vvau_node.set_nodeattr("PE", pe) + new_model = new_model.transform(SetExecMode("cppsim")) + new_model = new_model.transform(PrepareCppSim()) + new_model = new_model.transform(CompileCppSim()) + + assert oxe.compare_execution(model, new_model, input_dict) + + +# PE +@pytest.mark.parametrize("pe", [1, 2, 4]) +# Output activation +@pytest.mark.parametrize("act", [None, DataType.UINT4]) +# kernel size +@pytest.mark.parametrize("k", [2, 4]) +# stride +@pytest.mark.parametrize("stride", [1, 2]) +# padding +@pytest.mark.parametrize("padding", [0, 1]) +@pytest.mark.slow +@pytest.mark.vivado +def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding): + idt = wdt = DataType.INT4 + ifm_dim = 6 + ifm_ch = 4 + + # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold) + model = set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding) + + input_tensor = gen_finn_dt_tensor(idt, [1, ifm_dim, ifm_dim, ifm_ch]) + input_dict = {"inp": input_tensor} + + new_model = model.transform(InferConvInpGen()) + new_model = new_model.transform(InferVVAU()) + + # set SIMD in ConvInputGen node and PE in VVAU node + + for n in new_model.graph.node: + if n.op_type == "ConvolutionInputGenerator": + convinputgen_node = getCustomOp(n) + convinputgen_node.set_nodeattr("SIMD", pe) + elif n.op_type == "Vector_Vector_Activate_Batch": + vvau_node = getCustomOp(n) + vvau_node.set_nodeattr("PE", pe) + + new_model = new_model.transform(SetExecMode("rtlsim")) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) + new_model = new_model.transform(HLSSynthIP()) + new_model = new_model.transform(ReplaceVerilogRelPaths()) + new_model = new_model.transform(PrepareRTLSim()) + + assert oxe.compare_execution(model, new_model, input_dict) diff --git 
a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index f94784457a43718516e76946269fc47119423b24..81456796a75c6bf6a01c0a1f83c38b0b39bf4c81 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest +import numpy as np from onnx import TensorProto, helper @@ -44,6 +45,8 @@ from finn.util.basic import gen_finn_dt_tensor from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer def make_addstreams_modelwrapper(ch, pe, idt): @@ -125,3 +128,12 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): y_produced = y_produced.reshape(y_expected.shape) assert (y_produced == y_expected).all(), exec_mode + " failed" + + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("AddStreams_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py index 2ed352e28981552b186bb778b94dcbc07471e14b..23ce8314e9c45196d7311ac58cb6bb5ef5267220 100644 --- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -46,6 +46,8 @@ from finn.util.basic import gen_finn_dt_tensor from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): @@ -154,3 +156,11 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m if exec_mode == "rtlsim": hls_synt_res_est = model.analysis(hls_synth_res_estimation) assert "ChannelwiseOp_Batch_0" in hls_synt_res_est + + node = model.get_nodes_by_op_type("ChannelwiseOp_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index b5fc85caf274edc9e7afc52df962862fa8a99ba3..020a2a545dadaf32c469789c90d0ea530688812c 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import pytest +import numpy as np from onnx import TensorProto, helper @@ -42,6 +43,9 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.general import GiveUniqueNodeNames from finn.util.basic import gen_finn_dt_tensor +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer + def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt): odt = idt @@ -182,3 +186,12 @@ def test_fpgadataflow_slidingwindow( y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k) assert (y_produced == y_expected).all() + + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("ConvolutionInputGenerator")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index 59ac1c09f4fe338ef03a8166c63b9d4b29bbc08e..5066b9709cac922f6bd3670ec7199f3e0f8fd9a2 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest +import numpy as np from onnx import TensorProto, helper @@ -46,6 +47,8 @@ from finn.util.basic import gen_finn_dt_tensor from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer def make_dupstreams_modelwrapper(ch, pe, idim, idt): @@ -130,3 +133,12 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode): assert (y0 == expected_y).all(), exec_mode + " failed" assert (y1 == expected_y).all(), exec_mode + " failed" + + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("DuplicateStreams_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py index 952d994076fc4da7e7f763d9f0fe303d8da0ff11..37a1cc81ebd0824cdd8ac2c073298ad39424f57f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py +++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py @@ -49,6 +49,7 @@ from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -311,6 +312,14 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): hls_synt_res_est = model.analysis(hls_synth_res_estimation) assert "StreamingFCLayer_Batch_0" in hls_synt_res_est + node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = 
model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) + assert exp_cycles != 0 + # mem_mode: const or decoupled @pytest.mark.parametrize("mem_mode", ["decoupled"]) @@ -329,7 +338,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) @pytest.mark.vivado -def test_fpgadataflow_fclayer_large_depth_decoupled_mode( +def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( mem_mode, idt, wdt, act, nf, sf, mw, mh ): if nf == -1: @@ -403,3 +412,11 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode( hls_synt_res_est = model.analysis(hls_synth_res_estimation) assert "StreamingFCLayer_Batch_0" in hls_synt_res_est + + node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index 94090a47ad64fc377530e6e21d35661e1d92b5a6..a0881e2c95a491c79bb86b9817fb81735eb63d81 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -99,28 +99,32 @@ def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype): input values anymore.""" assert y.shape == tuple(Shape), """The output shape is incorrect.""" - model = model.transform(ReplaceVerilogRelPaths()) - model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - model = model.transform(MakePYNQProject(test_pynq_board)) - model = model.transform(SynthPYNQProject()) - model = model.transform(MakePYNQDriver()) - ip = os.environ["PYNQ_IP"] - username = os.getenv("PYNQ_USERNAME", "xilinx") - password = os.getenv("PYNQ_PASSWORD", "xilinx") - port = os.getenv("PYNQ_PORT", 22) - target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") - model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir)) - - res = throughput_test(model) - expected_dict = {} - expected_dict["runtime[ms]"] = [] - expected_dict["throughput[images/s]"] = [] - expected_dict["DRAM_in_bandwidth[Mb/s]"] = [] - expected_dict["DRAM_out_bandwidth[Mb/s]"] = [] - for key in expected_dict: - assert ( - key in res - ), """Throughput test not successful, no value for {} - in result dictionary""".format( - key - ) + try: + ip = os.environ["PYNQ_IP"] # NOQA + if ip == "": + pytest.skip("PYNQ board IP address not specified") + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model = model.transform(MakePYNQProject(test_pynq_board)) + model = model.transform(SynthPYNQProject()) + model = model.transform(MakePYNQDriver(platform="zynq")) + username = os.getenv("PYNQ_USERNAME", "xilinx") + password = os.getenv("PYNQ_PASSWORD", "xilinx") + port = os.getenv("PYNQ_PORT", 22) + target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") + model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir)) + res = throughput_test(model) + expected_dict = {} + expected_dict["runtime[ms]"] = [] + expected_dict["throughput[images/s]"] = [] + expected_dict["DRAM_in_bandwidth[Mb/s]"] = [] + expected_dict["DRAM_out_bandwidth[Mb/s]"] = [] + for key in expected_dict: + 
assert ( + key in res + ), """Throughput test not successful, no value for {} + in result dictionary""".format( + key + ) + except KeyError: + pytest.skip("PYNQ board IP address not specified") diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 5ff3da87228a2a32a41226bb46e0b16b1a44df50..ef4f17998dbb09d31cdc9b3c89afafd10653fd28 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -15,6 +15,8 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.util.basic import pynq_part_map @@ -123,3 +125,12 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): ) assert (y_produced == y_expected).all() + + if mode == "rtlsim": + node = model.get_nodes_by_op_type("FMPadding_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index b46391daf629e97c24c2950aefad3cbc5055c345..27f1a32a481f006818fbdd7e879bd9dd92242c80 100644 --- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -45,6 +45,8 @@ from finn.util.basic import gen_finn_dt_tensor from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer def make_accpool_modelwrapper(ch, pe, idim, idt): @@ -121,3 +123,17 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): expected_y = np.sum(x, axis=(1, 2)).flatten() assert (y == expected_y).all(), exec_mode + " failed" + + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("GlobalAccPool_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + # commented out, needs performance debug: + # test_fpgadataflow_globalaccpool[rtlsim-7-1-64-DataType.UINT4] + # assert False where False = + # <function isclose at 0x7eff26d5ca60>(50, 103, atol=(0.1 * 103)) + # assert np.isclose(exp_cycles, cycles_rtlsim, atol=0.1 * cycles_rtlsim) + assert exp_cycles != 0 + assert cycles_rtlsim != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 7cb31557dfaa61e3a5e5c0a7c65e1fbe717bf0f1..66b0ef921453e9e6fee9eb9be18cc556b2612f23 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -50,13 +50,20 @@ from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp from 
finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor, pynq_part_map +from finn.util.basic import ( + gen_finn_dt_tensor, + pynq_part_map, + alveo_part_map, + alveo_default_platform, +) from finn.util.fpgadataflow import pyverilate_stitched_ip from finn.util.test import load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext from finn.transformation.infer_data_layouts import InferDataLayouts from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.floorplan import Floorplan +from finn.transformation.fpgadataflow.vitis_build import VitisBuild +from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -336,7 +343,7 @@ def test_fpgadataflow_ipstitch_pynq_driver(): model = load_test_checkpoint_or_skip( ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_synth.onnx" ) - model = model.transform(MakePYNQDriver()) + model = model.transform(MakePYNQDriver(platform="zynq")) driver_dir = model.get_metadata_prop("pynq_driver_dir") assert driver_dir is not None assert os.path.isdir(driver_dir) @@ -410,3 +417,71 @@ def test_fpgadataflow_ipstitch_iodma_floorplan(): assert getCustomOp(model.graph.node[1]).get_nodeattr("partition_id") == 2 assert getCustomOp(model.graph.node[2]).get_nodeattr("partition_id") == 1 model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_iodma_floorplan.onnx") + + +# board +@pytest.mark.parametrize("board", ["U250"]) +# clock period +@pytest.mark.parametrize("period_ns", [5]) +# override mem_mode to external +@pytest.mark.parametrize("extw", [True, False]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.vitis +def test_fpgadataflow_ipstitch_vitis(board, period_ns, extw): + if "VITIS_PATH" not in os.environ: + pytest.skip("VITIS_PATH not set") + platform = alveo_default_platform[board] + fpga_part = alveo_part_map[board] + model = create_two_fc_model("external" if extw else "decoupled") + if model.graph.node[0].op_type == "StreamingDataflowPartition": + sdp_node = getCustomOp(model.graph.node[0]) + assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" + assert os.path.isfile(sdp_node.get_nodeattr("model")) + model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) + model = model.transform(VitisBuild(fpga_part, period_ns, platform)) + model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_vitis.onnx") + + +# board +@pytest.mark.parametrize("board", ["Pynq-Z1"]) +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_ipstitch_zynqbuild(board): + model = create_two_fc_model() + if model.graph.node[0].op_type == "StreamingDataflowPartition": + sdp_node = getCustomOp(model.graph.node[0]) + assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" + assert os.path.isfile(sdp_node.get_nodeattr("model")) + model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) + # generate inputs for remote exec + iname = "inp" + idt = model.get_tensor_datatype(iname) + ishape = model.get_tensor_shape(iname) + x = gen_finn_dt_tensor(idt, ishape) + # bitfile using ZynqBuild + model = model.transform(ZynqBuild(board, 10)) + model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_customzynq.onnx") + + bitfile_name = model.get_metadata_prop("vivado_pynq_bitfile") + assert bitfile_name is not None + assert os.path.isfile(bitfile_name) + # deployment + try: + ip = os.environ["PYNQ_IP"] # no default for this 
one; skip if not defined + if ip == "": + pytest.skip("PYNQ board IP address not specified") + username = os.getenv("PYNQ_USERNAME", "xilinx") + password = os.getenv("PYNQ_PASSWORD", "xilinx") + port = os.getenv("PYNQ_PORT", 22) + target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") + model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir)) + deployment_dir = model.get_metadata_prop("pynq_deploy_dir") + assert deployment_dir is not None + assert os.path.isdir(deployment_dir) + # remote exec + input_dict = {"global_in": x} + outp = execute_onnx(model, input_dict) + assert np.isclose(outp["global_out"], x).all() + except KeyError: + pytest.skip("PYNQ board IP address not specified") diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 50b990f13494f22e985406791445b406e9946147..1715bcad0dd29799cdc99497179ce8635058f3be 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -47,6 +47,8 @@ from finn.util.basic import gen_finn_dt_tensor from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer def make_single_thresholding_modelwrapper(T, pe, idt, odt): @@ -152,3 +154,11 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode): if exec_mode == "rtlsim": hls_synt_res_est = model.analysis(hls_synth_res_estimation) assert "Thresholding_Batch_0" in hls_synt_res_est + + node = model.get_nodes_by_op_type("Thresholding_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py index bda66bebbd93d346eb0026b17cbaff9a7ca5df5e..d61edc86dd6b5669c334e6b7f78ea9a8550cae93 100644 --- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py +++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py @@ -41,6 +41,9 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.general import GiveUniqueNodeNames from finn.util.basic import gen_finn_dt_tensor +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.custom_op.registry import getCustomOp +import numpy as np def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt): @@ -154,3 +157,12 @@ def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode): # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced == y_expected).all() + + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) + assert exp_cycles != 0 diff --git a/tests/pynq/test_pynq_performance_fifo.py b/tests/pynq/test_pynq_performance_fifo.py index 
1d4542473c4b58d3baa62f4123fd0f2f76954d95..1a438f79e09925cab57866c83a3cc9c8a1896351 100644 --- a/tests/pynq/test_pynq_performance_fifo.py +++ b/tests/pynq/test_pynq_performance_fifo.py @@ -81,7 +81,7 @@ def test_pynq_performance_fifo(): model = model.transform(CreateStitchedIP(fpga_part, clk_ns)) model = model.transform(MakePYNQProject(board)) model = model.transform(SynthPYNQProject()) - model = model.transform(MakePYNQDriver()) + model = model.transform(MakePYNQDriver(platform="zynq")) model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir)) ret = dict() diff --git a/tests/transformation/test_absorb_transp_into_flatten.py b/tests/transformation/test_absorb_transp_into_flatten.py index fbfa15277717c554da01e38608601997407803b2..cbbb33b4606acf55ace662da0986105f8c456b39 100644 --- a/tests/transformation/test_absorb_transp_into_flatten.py +++ b/tests/transformation/test_absorb_transp_into_flatten.py @@ -57,9 +57,9 @@ def test_absorb_transp_into_flatten(perm, shape, ishape, data_layout): model = model.transform(InferDataLayouts()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) - model.save("test.onnx") + # model.save("test.onnx") model_transformed = model.transform(AbsorbTransposeIntoFlatten()) - model_transformed.save("test2.onnx") + # model_transformed.save("test2.onnx") # verify transformation inp_values = np.random.uniform(low=-1, high=1, size=tuple(ishape)).astype( diff --git a/tests/transformation/test_conv_lowering.py b/tests/transformation/test_conv_lowering.py index 16c574b29b55e314b06661b28e4bb869bd6b7996..ab545d483321f8c52625b5401828277987bba3a9 100644 --- a/tests/transformation/test_conv_lowering.py +++ b/tests/transformation/test_conv_lowering.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
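+# Note on the depthwise case tested below: lowering a grouped Conv produces an
+# Im2Col node with depthwise=1 and a sparse MatMul weight matrix, so the test
+# checks the execution results as well as both of these annotations.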
+import pytest import onnx.helper as oh from onnx import TensorProto import os @@ -34,12 +35,16 @@ import brevitas.onnx as bo import numpy as np from finn.core.modelwrapper import ModelWrapper +from finn.core.datatype import DataType from finn.transformation.fold_constants import FoldConstants from finn.transformation.infer_shapes import InferShapes from finn.util.test import get_test_model_trained from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul from finn.transformation.double_to_single_float import DoubleToSingleFloat import finn.core.onnx_exec as oxe +from finn.custom_op.im2col import compute_conv_output_dim +from finn.util.basic import gen_finn_dt_tensor +from finn.custom_op.registry import getCustomOp export_onnx_path = "test_conv_lowering.onnx" @@ -68,6 +73,76 @@ def test_conv_lowering_cnv_w1a1(): os.remove(export_onnx_path) +# input datatype +@pytest.mark.parametrize("idt", [DataType.INT2, DataType.INT4]) +# kernel size +@pytest.mark.parametrize("k", [2, 4]) +# input dimension +@pytest.mark.parametrize("ifm_dim", [4, 6]) +# input channels +@pytest.mark.parametrize("ifm_ch", [2, 3]) +# stride +@pytest.mark.parametrize("stride", [1, 2]) +# padding +@pytest.mark.parametrize("padding", [[0, 0, 0, 0], [1, 1, 1, 1]]) +def test_depthwise_conv_lowering(idt, k, ifm_dim, ifm_ch, stride, padding): + wdt = idt + odt = DataType.INT32 + ofm_ch = ifm_ch + ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad=padding[0]) + + # set up onnx model + inp = oh.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] + ) + outp = oh.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim] + ) + + W = oh.make_tensor_value_info("W", TensorProto.FLOAT, [ofm_ch, 1, k, k]) + + dw_cnv = oh.make_node( + "Conv", + inputs=["inp", "W"], + outputs=["outp"], + kernel_shape=[k, k], + pads=padding, + strides=[stride, stride], + group=ifm_ch, + ) + graph = oh.make_graph( + nodes=[dw_cnv], + name="dw_cnv_graph", + inputs=[inp], + outputs=[outp], + value_info=[W], + ) + + model = oh.make_model(graph, producer_name="dws_cnv-model") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + model.set_tensor_datatype("W", wdt) + w_tensor = gen_finn_dt_tensor(wdt, [ofm_ch, 1, k, k]) + model.set_initializer("W", w_tensor) + model = model.transform(InferShapes()) + + input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim, ifm_dim]) + input_dict = {"inp": input_tensor} + output_dict = oxe.execute_onnx(model, input_dict) + expected = output_dict["outp"] + + model = model.transform(LowerConvsToMatMul()) + output_dict = oxe.execute_onnx(model, input_dict) + produced = output_dict["outp"] + assert (produced == expected).all() + + # check if created nodes have attributes that indicate depthwise conv + assert model.get_tensor_sparsity("W") is not None + im2col_node = getCustomOp(model.graph.node[1]) + assert im2col_node.get_nodeattr("depthwise") == 1 + + def test_conv_lowering_conv_1x1(): np.random.seed(0) diff --git a/tests/transformation/test_topk_insert.py b/tests/transformation/test_topk_insert.py index b85ed4aa6999faf751e535c1cc687d639c4eb74f..a18e63384150f140cb63ec7b438283eb4797266c 100644 --- a/tests/transformation/test_topk_insert.py +++ b/tests/transformation/test_topk_insert.py @@ -1,4 +1,4 @@ -# import os +import os import onnx from finn.util.test import get_test_model_trained import brevitas.onnx as bo @@ -57,4 +57,4 @@ def test_topk_insert(k): output_pysim_topk = 
output_pysim_topk.astype(np.int).flatten() assert np.array_equal(output_golden_topk, output_pysim_topk) - # os.remove(export_onnx_path) + os.remove(export_onnx_path) diff --git a/tests/util/test_create.py b/tests/util/test_create.py index 7173add35abf04a35c33b0ef10b42ffdb296a653..4e236978592b02e1c18b03aba56ff8b2369311a6 100644 --- a/tests/util/test_create.py +++ b/tests/util/test_create.py @@ -61,4 +61,4 @@ def test_hls_random_mlp_maker(bitwidth): ret = create.hls_random_mlp_maker(layer_spec) assert len(ret.graph.node) == 5 - ret.save("mlp-%s.onnx" % str(bitwidth)) + # ret.save("mlp-%s.onnx" % str(bitwidth))