diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml index 924fbd24a174df49af4b3e259ad57d0a7907d42b..0233a81ba06dc701a3a4579b9a5bd3ce17e47d04 100644 --- a/.github/workflows/quicktest-dev-pr.yml +++ b/.github/workflows/quicktest-dev-pr.yml @@ -5,7 +5,7 @@ on: branches: [ dev ] push: branches: [ dev ] - + jobs: @@ -18,6 +18,6 @@ jobs: uses: actions/checkout@v2 - name: DockerRunQuicktest - env: - NUM_DEFAULT_WORKERS: 4 - run: sh run-docker.sh quicktest + run: | + docker build -t finn_gha -f docker/Dockerfile.finn_ci --build-arg BUILD_PATH=/tmp/finn_gha . + docker run --init --hostname finn_gha -v $(pwd):/workspace/finn -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci index d06ff8521555ccd6d09383cab039850f1565fc61..7d5772d9f5118d1f1238dd14a6b57a1b4fd5004d 100644 --- a/docker/Dockerfile.finn_ci +++ b/docker/Dockerfile.finn_ci @@ -30,7 +30,6 @@ FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel MAINTAINER Yaman Umuroglu <yamanu@xilinx.com> ARG PYTHON_VERSION=3.6 ARG BUILD_PATH -ARG FINN_CI_BRANCH WORKDIR /workspace @@ -55,10 +54,9 @@ RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-He # oh-my-xilinx RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx -# checkout desired FINN branch for testing -RUN git clone --branch $FINN_CI_BRANCH https://github.com/Xilinx/finn /workspace/finn - -RUN pip install -r /workspace/finn/requirements.txt +COPY requirements.txt . 
+RUN pip install -r requirements.txt +RUN rm requirements.txt RUN apt update; apt install nano RUN pip install pytest-dependency RUN pip install pytest-xdist @@ -78,8 +76,8 @@ RUN mkdir -p $VIVADO_IP_CACHE WORKDIR /workspace/finn -COPY finn_entrypoint.sh /usr/local/bin/ -COPY quicktest.sh /usr/local/bin/ +COPY docker/finn_entrypoint.sh /usr/local/bin/ +COPY docker/quicktest.sh /usr/local/bin/ RUN chmod 755 /usr/local/bin/finn_entrypoint.sh RUN chmod 755 /usr/local/bin/quicktest.sh ENTRYPOINT ["finn_entrypoint.sh"] diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev index f8919d7498e0e8ef08a52d1da0782988b56d6df4..8c1502eb4a1941061bd58e6f9a18106f98f259e2 100644 --- a/docker/Dockerfile.finn_dev +++ b/docker/Dockerfile.finn_dev @@ -50,7 +50,6 @@ COPY requirements.txt . RUN pip install -r requirements.txt RUN rm requirements.txt RUN pip install jupyter -RUN pip install netron RUN pip install matplotlib RUN pip install pytest-dependency RUN pip install sphinx @@ -81,13 +80,26 @@ RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld # oh-my-xilinx RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx +# netron +RUN git clone https://github.com/lutzroeder/netron.git /workspace/netron + +# build and install netron +USER root +RUN curl -sL https://deb.nodesource.com/setup_12.x | bash - +RUN apt-get install -y nodejs +WORKDIR /workspace/netron +RUN git checkout 376e9d33733a3eacfe3c432808fd46e6cd1460cb +RUN npm install +RUN python setup.py build +RUN pip install /workspace/netron +USER $UNAME # for this developer-oriented Docker container we assume the FINN repo is cloned and mounted from the host # at /workspace/finn -- see run-docker.sh for an example of how to do this. 
ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src" ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator" ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards" -ENV PATH "${PATH}:/workspace/oh-my-xilinx" +ENV PATH "${PATH}:/workspace/oh-my-xilinx:/home/$UNAME/.local/bin" ENV OHMYXILINX "/workspace/oh-my-xilinx" WORKDIR /home/$UNAME/finn diff --git a/docker/Jenkinsfile b/docker/Jenkinsfile index 2215bc79cc7b2c20036d882fdc654fbe8721cab6..b2d3102bd4aa3c00620f41c102af5a8b385cede7 100644 --- a/docker/Jenkinsfile +++ b/docker/Jenkinsfile @@ -15,11 +15,13 @@ pipeline { string(name: 'DOCKER_CMD_RTLSIM', defaultValue: """python setup.py test --addopts "-k rtlsim --workers auto" """, description: 'rtlsim test command') // end2end tests: no parallel testing, use NUM_DEFAULT_WORKERS for parallel transformations string(name: 'DOCKER_CMD_END2END', defaultValue: """python setup.py test --addopts "-k end2end" """, description: 'end2end test command') + // allow specifying where to mount the cloned folder from, since Jenkins and FINN may be running in separate containers + string(name: 'WORKSPACE_MOUNT', defaultValue: '/var/jenkins_home/workspace/finn', description: 'Path to Jenkins workspace mount') } environment { DOCKER_TAG='finn_ci:$BUILD_ID' - DOCKER_INST_NAME='finn_ci_$BUILD_ID' - BUILD_PATH='/tmp/finn_ci_$BUILD_ID' + DOCKER_INST_NAME='finn_ci' + BUILD_PATH='/tmp/finn_ci' } stages { stage("Clone") { @@ -32,17 +34,17 @@ pipeline { sh """ docker build -t $DOCKER_TAG -f docker/Dockerfile.finn_ci \ --build-arg BUILD_PATH=$BUILD_PATH \ - --build-arg FINN_CI_BRANCH=${params.FINN_CI_BRANCH} \ - docker/ + . 
""" } } stage('test-main') { steps { - catchError { + catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { sh """ docker run --init \ --hostname $DOCKER_INST_NAME \ + -v ${params.WORKSPACE_MOUNT}:/workspace/finn \ -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \ -e NUM_DEFAULT_WORKERS=1 \ -e FINN_INST_NAME=$DOCKER_INST_NAME \ @@ -58,10 +60,11 @@ pipeline { } stage('test-rtlsim') { steps { - catchError { + catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { sh """ docker run --init \ --hostname $DOCKER_INST_NAME \ + -v ${params.WORKSPACE_MOUNT}:/workspace/finn \ -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \ -e NUM_DEFAULT_WORKERS=1 \ -e FINN_INST_NAME=$DOCKER_INST_NAME \ @@ -77,10 +80,11 @@ pipeline { } stage('test-end2end') { steps { - catchError { + catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { sh """ docker run --init \ --hostname $DOCKER_INST_NAME \ + -v ${params.WORKSPACE_MOUNT}:/workspace/finn \ -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \ -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \ -e FINN_INST_NAME=$DOCKER_INST_NAME \ diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index b312737c317517ca0ab19c74cf22284b5977b661..ee75089c657e4fad1e4a455ac7bd5fe4976e5d4c 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -1,6 +1,5 @@ #!/bin/bash -export XILINX_VIVADO=$VIVADO_PATH export SHELL=/bin/bash export FINN_ROOT=/workspace/finn @@ -15,7 +14,7 @@ gecho () { # the repos themselves are cloned in the Dockerfile BREVITAS_COMMIT=f9a27226d4acf1661dd38bc449f71f89e0983cce CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 -HLSLIB_COMMIT=8f9f2018762f654f196b666838aeaf6fc730ad9a +HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada @@ -48,7 +47,14 @@ gecho "oh-my-xilinx @ $OMX_COMMIT" git -C 
/workspace/oh-my-xilinx pull --quiet git -C /workspace/oh-my-xilinx checkout $OMX_COMMIT --quiet -# source Vivado env.vars -source $VIVADO_PATH/settings64.sh - +if [ ! -z "$VIVADO_PATH" ];then + # source Vivado env.vars + export XILINX_VIVADO=$VIVADO_PATH + source $VIVADO_PATH/settings64.sh +fi +if [ ! -z "$VITIS_PATH" ];then + # source Vitis env.vars + export XILINX_VITIS=$VITIS_PATH + source $VITIS_PATH/settings64.sh +fi exec "$@" diff --git a/docker/quicktest.sh b/docker/quicktest.sh index 49b7886836ac4e45dad856dfcd49223276bd831a..75d07d15338fd422bc6749b0a61b392616c61c5a 100755 --- a/docker/quicktest.sh +++ b/docker/quicktest.sh @@ -3,11 +3,10 @@ : ${PYTEST_PARALLEL=auto} cd $FINN_ROOT - # check if command line argument is empty or not present if [ -z $1 ]; then echo "Running quicktest: not (vivado or slow) with pytest-xdist" - python setup.py test --addopts "-m 'not (vivado or slow)' --dist=loadfile -n $PYTEST_PARALLEL" + python setup.py test --addopts "-m 'not (vivado or slow or vitis)' --dist=loadfile -n $PYTEST_PARALLEL" elif [ $1 = "main" ]; then echo "Running main test suite: not (rtlsim or end2end) with pytest-xdist" python setup.py test --addopts "-k not (rtlsim or end2end) --dist=loadfile -n $PYTEST_PARALLEL" diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 8b20cebcfc49d14d0afbb26edd678d65425476d3..323692897800d45c6e6cf55b688a2c7b2b9a5277 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -13,7 +13,7 @@ The FINN compiler should not be thought of a single pushbutton tool that does ev Requirements ============ -* Ubuntu 18.04 +* Ubuntu 18.04 with `bash` installed * Docker * A working Vivado 2019.1 installation * A `VIVADO_PATH` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located) @@ -26,9 +26,11 @@ We use Docker extensively for developing and deploying FINN. 
If you are not fami Getting an interactive shell for development or experimentation *************************************************************** +.. note:: **run-docker.sh requires bash to execute correctly.** + :: - sh run_docker.sh + ./run-docker.sh Simply running sh run-docker.sh without any additional arguments will clone the dependency repos, create a Docker container and give you a terminal with you can use for development for experimentation. If you want a new terminal on an already-running container, you can do this with `docker exec -it finn_dev_<username> bash`. @@ -41,7 +43,7 @@ Running the Jupyter notebooks ***************************** :: - sh run-docker.sh notebook + ./run-docker.sh notebook This will launch the `Jupyter notebook <https://jupyter.org/>`_ server inside a Docker container, and print a link on the terminal that you can open in your browser to run the FINN notebooks or create new ones. .. note:: The link will look something like this (the token you get will be different): @@ -57,14 +59,14 @@ by: :: - sh run-docker.sh test + ./run-docker.sh test There is a quicker variant of the test suite that skips the tests marked as requiring Vivado or as slow-running tests: :: - sh run-docker.sh quicktest + ./run-docker.sh quicktest If you want to run individual tests, you can do this *inside the Docker container from the FINN root directory* as follows: diff --git a/run-docker.sh b/run-docker.sh index 00ca8f86985a78d8f2af099c51dcd4b80cd2e974..88956586c6a2ba9780d0597f8149038dad4aa6ab 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -50,6 +50,15 @@ if [ -z "$PYNQ_IP" ];then recho "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests." fi +if [ -z "$VITIS_PATH" ];then + recho "Please set the VITIS_PATH that contains the path to your Vitis installation directory." + recho "FINN functionality depending on Vitis will not be available." 
+else + if [ -z "$PLATFORM_REPO_PATHS" ];then + recho "Please set PLATFORM_REPO_PATHS pointing to Vitis platform files (DSAs)." + fi +fi + DOCKER_GID=$(id -g) DOCKER_GNAME=$(id -gn) DOCKER_UNAME=$(id -un) @@ -93,6 +102,7 @@ mkdir -p $FINN_SSH_KEY_DIR gecho "Instance is named as $DOCKER_INST_NAME" gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL" gecho "Mounting $VIVADO_PATH into $VIVADO_PATH" +gecho "Mounting $VITIS_PATH into $VITIS_PATH" gecho "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT" gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT" gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE" @@ -128,24 +138,34 @@ docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \ # Launch container with current directory mounted # important to pass the --init flag here for correct Vivado operation, see: # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins -docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \ ---hostname $DOCKER_INST_NAME \ --e "XILINX_VIVADO=$VIVADO_PATH" \ --e "SHELL=/bin/bash" \ --v $SCRIPTPATH:/workspace/finn \ --v $BUILD_LOCAL:$BUILD_LOCAL \ --v $VIVADO_PATH:$VIVADO_PATH \ --v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh \ --e VIVADO_PATH=$VIVADO_PATH \ --e FINN_INST_NAME=$DOCKER_INST_NAME \ --e FINN_ROOT="/workspace/finn" \ --e VIVADO_IP_CACHE="$VIVADO_IP_CACHE" \ --e PYNQ_BOARD=$PYNQ_BOARD \ --e PYNQ_IP=$PYNQ_IP \ --e PYNQ_USERNAME=$PYNQ_USERNAME \ --e PYNQ_PASSWORD=$PYNQ_PASSWORD \ --e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR \ --e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS \ --p $JUPYTER_PORT:$JUPYTER_PORT \ --p $NETRON_PORT:$NETRON_PORT \ -$DOCKER_TAG $DOCKER_CMD +DOCKER_EXEC="docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init " +DOCKER_EXEC+="--hostname $DOCKER_INST_NAME " +DOCKER_EXEC+="-e SHELL=/bin/bash " +DOCKER_EXEC+="-v $SCRIPTPATH:/workspace/finn " +DOCKER_EXEC+="-v $BUILD_LOCAL:$BUILD_LOCAL " +DOCKER_EXEC+="-v 
$FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh " +DOCKER_EXEC+="-e FINN_INST_NAME=$DOCKER_INST_NAME " +DOCKER_EXEC+="-e FINN_ROOT="/workspace/finn" " +DOCKER_EXEC+="-e VIVADO_IP_CACHE=$VIVADO_IP_CACHE " +DOCKER_EXEC+="-e PYNQ_BOARD=$PYNQ_BOARD " +DOCKER_EXEC+="-e PYNQ_IP=$PYNQ_IP " +DOCKER_EXEC+="-e PYNQ_USERNAME=$PYNQ_USERNAME " +DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD " +DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR " +DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS " +DOCKER_EXEC+="-p $JUPYTER_PORT:$JUPYTER_PORT " +DOCKER_EXEC+="-p $NETRON_PORT:$NETRON_PORT " +if [ ! -z "$VIVADO_PATH" ];then + DOCKER_EXEC+="-e "XILINX_VIVADO=$VIVADO_PATH" " + DOCKER_EXEC+="-v $VIVADO_PATH:$VIVADO_PATH " + DOCKER_EXEC+="-e VIVADO_PATH=$VIVADO_PATH " +fi +if [ ! -z "$VITIS_PATH" ];then + DOCKER_EXEC+="-v $VITIS_PATH:$VITIS_PATH " + DOCKER_EXEC+="-v $PLATFORM_REPO_PATHS:/workspace/finn/vitis_platforms " + DOCKER_EXEC+="-e VITIS_PATH=$VITIS_PATH " + DOCKER_EXEC+="-e PLATFORM_REPO_PATHS=/workspace/finn/vitis_platforms " +fi +DOCKER_EXEC+="$DOCKER_TAG $DOCKER_CMD" + +$DOCKER_EXEC diff --git a/setup.cfg b/setup.cfg index 1d7dcf247636b486e35d6320669eae706c2b7a72..7729d0949ee133e06242905afab31708e79ebf04 100644 --- a/setup.cfg +++ b/setup.cfg @@ -104,6 +104,7 @@ addopts = markers = slow: marks tests as slow (deselect with '-m "not slow"') vivado: mark tests that require Vivado or Vivado HLS + vitis: mark tests that require Vitis norecursedirs = dist build diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py index 1e1bee3aa7435d5cab6cbf5ea23dd37dcdfa4380..bb5b3075582b8e01e8eed95f709934302fcadb42 100644 --- a/src/finn/core/rtlsim_exec.py +++ b/src/finn/core/rtlsim_exec.py @@ -114,19 +114,19 @@ def rtlsim_exec(model, execution_context): def _reset_rtlsim(sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" - sim.io.ap_rst_n_0 = 0 + sim.io.ap_rst_n = 0 _toggle_clk(sim) _toggle_clk(sim) - sim.io.ap_rst_n_0 = 1 + 
sim.io.ap_rst_n = 1 _toggle_clk(sim) _toggle_clk(sim) def _toggle_clk(sim): """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk_0 = 0 + sim.io.ap_clk = 0 sim.eval() - sim.io.ap_clk_0 = 1 + sim.io.ap_clk = 1 sim.eval() @@ -140,7 +140,7 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True): from finn.util.fpgadataflow)""" inputs = inp outputs = [] - sim.io.out_r_0_tready = 1 + sim.io.m_axis_0_tready = 1 # observe if output is completely calculated # observation_count will contain the number of cycles the calculation ran @@ -159,12 +159,12 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True): _reset_rtlsim(sim) while not (output_observed): - sim.io.in0_V_V_0_tvalid = 1 if len(inputs) > 0 else 0 - sim.io.in0_V_V_0_tdata = inputs[0] if len(inputs) > 0 else 0 - if sim.io.in0_V_V_0_tready == 1 and sim.io.in0_V_V_0_tvalid == 1: + sim.io.s_axis_0_tvalid = 1 if len(inputs) > 0 else 0 + sim.io.s_axis_0_tdata = inputs[0] if len(inputs) > 0 else 0 + if sim.io.s_axis_0_tready == 1 and sim.io.s_axis_0_tvalid == 1: inputs = inputs[1:] - if sim.io.out_r_0_tvalid == 1 and sim.io.out_r_0_tready == 1: - outputs = outputs + [sim.io.out_r_0_tdata] + if sim.io.m_axis_0_tvalid == 1 and sim.io.m_axis_0_tready == 1: + outputs = outputs + [sim.io.m_axis_0_tdata] _toggle_clk(sim) observation_count = observation_count + 1 diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 71c731f96ca45519c443a5f932ead050770e17de..bc816f18c5f72338dc726e504182998f3f4430b7 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -102,6 +102,23 @@ class HLSCustomOp(CustomOp): prefixed_top_name = "%s_%s" % (node.name, node.name) return prefixed_top_name + def get_verilog_top_module_intf_names(self): + """Return a dict of names of input and output interfaces. 
+ The keys reflect the protocols each interface implements: + 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. + Values are lists of names: + 's_axis' names correspond to the list of node inputs in order, + 'm_axis' names correspond to the list of node outputs in order' + Each block must have at most one aximm and one axilite.""" + intf_names = {} + intf_names["clk"] = ["ap_clk"] + intf_names["rst"] = ["ap_rst_n"] + intf_names["s_axis"] = ["in0_V_V"] + intf_names["m_axis"] = ["out_V_V"] + intf_names["aximm"] = [] + intf_names["axilite"] = [] + return intf_names + def get_verilog_top_filename(self): "Return the Verilog top module filename for this node." diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py index d5f5c1194d36e86b895610c084222db5ab9eb2bf..d73f22672e7163eef0738d067f951e90fe80a89f 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py @@ -356,3 +356,8 @@ class AddStreams_Batch(HLSCustomOp): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + intf_names["s_axis"] = ["in0_V_V", "in1_V_V"] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py index 54051af5e0387081a23e1f8fa77ec9e363098830..e4762509fb6246bafa7441e194312d69ad585d1b 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py @@ -32,7 +32,7 @@ import numpy as np from finn.core.datatype import DataType from finn.custom_op.fpgadataflow import HLSCustomOp -from onnx import TensorProto, helper +from onnx import helper, TensorProto from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -80,24 +80,33 @@ class 
DuplicateStreams_Batch(HLSCustomOp): def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) assert ishape == exp_ishape, "Unexpected input shape." - # implement tensor with correct shape - values = np.random.randn(*oshape).astype(np.float32) + + oshape = self.get_normal_output_shape() + values = np.zeros(oshape).astype(np.float32) split_input = np.concatenate((values, values), axis=0) - return helper.make_node( + + split_in = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, oshape + ) + + model.graph.value_info.append(split_in) # requires clean up + model.set_initializer(split_in.name, split_input) + + shape_comp_node = helper.make_node( "Split", - inputs=[split_input], - outputs=[self.onnx_node.output[0], self.onnx_node.output[0]], - value=helper.make_tensor( - name="const_tensor", data_type=TensorProto.FLOAT, axis=0 - ), + inputs=[split_in.name], + outputs=[self.onnx_node.output[0], self.onnx_node.output[1]], + axis=0, ) + return shape_comp_node + def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(self.onnx_node.output[0], odt) + model.set_tensor_datatype(self.onnx_node.output[1], odt) def verify_node(self): info_messages = [] @@ -359,3 +368,8 @@ class DuplicateStreams_Batch(HLSCustomOp): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + intf_names["m_axis"] = ["out0_V_V", "out1_V_V"] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py index 9b718ecbbc490610790b68871080de23a54f4891..05870b8d9d5d3a11bad7882c9a7d122f8cd34cf6 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -344,3 
+344,15 @@ class IODMA(HLSCustomOp): def strm_decl(self): pass + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + if self.get_nodeattr("direction") == "out": + intf_names["s_axis"] = ["in0_V_V"] + intf_names["m_axis"] = [] + else: + intf_names["s_axis"] = [] + intf_names["m_axis"] = ["out_V_V"] + intf_names["axilite"] = ["s_axi_control"] + intf_names["aximm"] = ["m_axi_gmem"] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index c7edc24d0e24eef1154293caca2519ab3aa68358..801a634fdba1cd5e16c7c211175c1e7380bf0070 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -39,16 +39,18 @@ class Pool_Batch(HLSCustomOp): """Class that corresponds to finn-hlslib Pool_batch function. Requires ConvolutionInputGenerator(depthwise == 1) to format its input - TODO: explain input shape (to reuse im2col code) Input shape (BatchSize,OutImgDim,OutImgDim,KernelSize^2*Channels) Output shape (BatchSize,OutImgDim,OutImgDim,Channels) - # note: the actual data layout produced by the hlslib kernels is different - # for depthwise ops. - # * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + Notes: + # The input shape was chosen to be compatible with im2col (only true when there + is not folding). + + # The actual data layout produced by the hlslib kernels is different + for depthwise ops. 
+ * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) Channels can be folded using PE (SIMD from the input perspective) - TODO: doc """ def get_nodeattr_types(self): @@ -63,7 +65,10 @@ class Pool_Batch(HLSCustomOp): "Function": ("s", True, ""), "OutImgDim": ("i", True, 0), # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), + "InputDataType": ("s", True, ""), + "OutputDataType": ("s", True, ""), + "AccumBits": ("i", False, 0), + "Size": ("i", False, 1), "BatchSize": ("i", False, 1), } @@ -72,17 +77,28 @@ class Pool_Batch(HLSCustomOp): def get_input_datatype(self): """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("dataType")] + return DataType[self.get_nodeattr("InputDataType")] def get_output_datatype(self): """Returns FINN DataType of output.""" fxn = self.get_nodeattr("Function") + odt = DataType[self.get_nodeattr("OutputDataType")] + if fxn == "MaxPool": # Same as input - return DataType[self.get_nodeattr("dataType")] + idt = DataType[self.get_nodeattr("InputDataType")] + assert odt == idt, "In datatype must be equal to out datatype for Maxpool" + elif fxn == "QuantAvgPool": + idt = DataType[self.get_nodeattr("InputDataType")] + assert ( + idt.signed() == odt.signed() + ), """QuantAvgPool: Can't mix signed + and unsigned datatypes""" else: raise Exception("Pool_Batch doesn't currently support " + fxn) + return odt + def get_normal_input_shape(self): ifm_ch = self.get_nodeattr("Channels") odim = self.get_nodeattr("OutImgDim") @@ -123,19 +139,14 @@ class Pool_Batch(HLSCustomOp): def get_instream_width(self): dt_bits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") - # ofm_ch = self.get_nodeattr("Channels") - # k = self.get_nodeattr("KernelSize") - # assert ifm_ch % pe == 0, "PE must divide input channels" - # simd = int(ifm_ch/pe) in_width = int(dt_bits * pe) return in_width def get_outstream_width(self): - fxn = self.get_nodeattr("Function") - if fxn == "MaxPool": - return 
self.get_instream_width() - else: - raise Exception("Pool_Batch doesn't currently support " + fxn) + dt_bits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = int(dt_bits * pe) + return out_width def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() @@ -187,7 +198,7 @@ class Pool_Batch(HLSCustomOp): # check supported function fnx = self.get_nodeattr("Function") - if fnx == "MaxPool": + if fnx in ["MaxPool", "QuantAvgPool"]: info_messages.append( "Attribute Function contains a supported pool function" ) @@ -251,7 +262,8 @@ class Pool_Batch(HLSCustomOp): i_hls_dt = idt.get_hls_datatype_str() odt = self.get_output_datatype() o_hls_dt = odt.get_hls_datatype_str() - + size = self.get_nodeattr("Size") + accum_bits = self.get_nodeattr("AccumBits") self.code_gen_dict["$DOCOMPUTE$"] = [] fxn = self.get_nodeattr("Function") @@ -259,6 +271,16 @@ class Pool_Batch(HLSCustomOp): self.code_gen_dict["$DOCOMPUTE$"] += [ "MaxPoolFunction<{},KernelSize> pool_fxn;".format(i_hls_dt) ] + elif fxn == "QuantAvgPool": + if idt.signed(): + act_hls_dt = "ap_int<{}>".format(accum_bits) + else: + act_hls_dt = "ap_uint<{}>".format(accum_bits) + self.code_gen_dict["$DOCOMPUTE$"] += [ + "QuantAvgPoolFunction<{},{},{}> pool_fxn;".format( + act_hls_dt, o_hls_dt, size + ) + ] else: raise Exception("Pool_Batch doesn't currently support " + fxn) @@ -369,7 +391,7 @@ class Pool_Batch(HLSCustomOp): super().reset_rtlsim(sim) super().toggle_clk(sim) rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt + odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index a7ebff68749120868cae9ce5ac18d2856fe2cb8a..9c3bd3ac87b94f3e0ff11a2937bf5083aae614f6 100644 --- 
a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -87,7 +87,8 @@ class StreamingFCLayer_Batch(HLSCustomOp): "numInputVectors": ("ints", False, [1]), # memory mode for the FC weights # const -- embedded weights, default, long compile/synth times - # decoupled -- streaming weights + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer "mem_mode": ("s", False, "const"), # FPGA resource type for memories in decoupled mode # auto -- let Vivado decide @@ -105,14 +106,14 @@ class StreamingFCLayer_Batch(HLSCustomOp): node = self.onnx_node # set top name depending on mem_mode mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "const" or mem_mode == "external": prefixed_top_name = "%s_%s" % (node.name, node.name) elif mem_mode == "decoupled": prefixed_top_name = "%s_memstream" % (node.name) else: raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) return prefixed_top_name @@ -301,7 +302,10 @@ class StreamingFCLayer_Batch(HLSCustomOp): def get_weightstream_width(self): """Returns weight stream width. 
Used only in decoupled mode.""" - if self.get_nodeattr("mem_mode") == "decoupled": + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") wp = self.get_weight_datatype().bitwidth() @@ -484,7 +488,8 @@ class StreamingFCLayer_Batch(HLSCustomOp): def generate_params(self, model, path): mem_mode = self.get_nodeattr("mem_mode") - # weights + code_gen_dir = path + # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) @@ -493,7 +498,6 @@ class StreamingFCLayer_Batch(HLSCustomOp): # so use it as such for weight generation if self.get_weight_datatype() == DataType.BIPOLAR: export_wdt = DataType.BINARY - code_gen_dir = path if mem_mode == "const": """Saves weights into params.h""" @@ -523,7 +527,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): f_weights.write(weight_hls_code) f_weights.close() - elif mem_mode == "decoupled": + elif mem_mode == "decoupled" or mem_mode == "external": """Saves weights in corresponding file format for cppsim or rtlsim""" # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) @@ -552,37 +556,37 @@ class StreamingFCLayer_Batch(HLSCustomOp): os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped ) - """Saves weights into .dat file""" - # convert weight values into hexstring - weight_width = self.get_weightstream_width() - # pad to nearest 4 bits to get hex strings - weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" - ) - weight_stream_len = np.prod(weight_tensor_pe_flipped.shape) - factor = math.ceil(weight_stream_len / 1024) - # add zeroes to pad 
out file to 1024 entries - weight_stream = weight_tensor_pe_flipped.flatten() - pad_amt = (factor * 1024) - weight_stream_len - weight_stream = np.pad( - weight_stream, (0, pad_amt), mode="constant", constant_values="0" - ) - weight_stream = weight_stream.copy() - i = 0 - j = 0 - for val in weight_stream: - if i == 1024: - i = 0 - j += 1 - with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f: - f.write(val + "\n") - i += 1 - + if mem_mode == "decoupled": + """Saves weights into .dat file""" + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + weight_stream_len = np.prod(weight_tensor_pe_flipped.shape) + factor = math.ceil(weight_stream_len / 1024) + # add zeroes to pad out file to 1024 entries + weight_stream = weight_tensor_pe_flipped.flatten() + pad_amt = (factor * 1024) - weight_stream_len + weight_stream = np.pad( + weight_stream, (0, pad_amt), mode="constant", constant_values="0" + ) + weight_stream = weight_stream.copy() + i = 0 + j = 0 + for val in weight_stream: + if i == 1024: + i = 0 + j += 1 + with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f: + f.write(val + "\n") + i += 1 else: raise Exception( - """Please set mem_mode to "const"i or "decoupled", currently no other - parameter value is supported!""" + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) # save thresholds in thresh.h @@ -630,6 +634,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node # TODO ensure codegen dir exists @@ -698,7 +703,24 @@ class 
StreamingFCLayer_Batch(HLSCustomOp): ) super().reset_rtlsim(sim) super().toggle_clk(sim) - output = self.rtlsim(sim, inp) + if mem_mode == "external": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType.BIPOLAR: + export_wdt = DataType.BINARY + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + io_dict = { + "inputs": {"in0": inp, "weights": wei}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -729,12 +751,12 @@ class StreamingFCLayer_Batch(HLSCustomOp): if mem_mode == "const": # self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"'] pass - elif mem_mode == "decoupled": + elif mem_mode == "decoupled" or mem_mode == "external": self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] else: raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) if self.calc_tmem() != 0: # TODO find a better way of checking for no pregenerated thresholds @@ -757,7 +779,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): numReps, ) ] - if mem_mode == "decoupled": + if mem_mode == "decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() self.code_gen_dict["$DEFINES$"].append( "#define WP1 {}\n".format(wdt.bitwidth()) @@ -783,7 +805,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): ) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() elem_bits = wdt.bitwidth() 
packed_bits = self.get_weightstream_width() @@ -807,7 +829,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) ) - if mem_mode == "decoupled": + if mem_mode == "decoupled" or mem_mode == "external": self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> weights ("weights");'.format( self.get_weightstream_width() @@ -835,7 +857,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): self.get_nodeattr("resType"), ) ] - elif mem_mode == "decoupled": + elif mem_mode == "decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() if wdt == DataType.BIPOLAR: export_wdt = DataType.BINARY @@ -856,8 +878,8 @@ class StreamingFCLayer_Batch(HLSCustomOp): else: raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) def dataoutstrm(self): @@ -903,7 +925,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): self.get_outstream_width(), ) ] - elif mem_mode == "decoupled": + elif mem_mode == "decoupled" or mem_mode == "external": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}( hls::stream<ap_uint<{}>> &in0, @@ -952,7 +974,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): "complete dim=1" ) ) - elif mem_mode == "decoupled": + elif mem_mode == "decoupled" or mem_mode == "external": self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=weights" ) @@ -962,8 +984,8 @@ class StreamingFCLayer_Batch(HLSCustomOp): else: raise Exception( - """Please set mem_mode to "const", currently no other - parameter value is supported!""" + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" ) # the threshold tensor is acc_type [PE][TMEM][N_THRES] @@ -1092,3 +1114,10 @@ class StreamingFCLayer_Batch(HLSCustomOp): ) self.set_nodeattr("ip_vlnv", 
vlnv) self.code_gen_dict.clear() + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "external": + intf_names["s_axis"] = ["in0_V_V", "weights_V_V"] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index 17ba44b959577faf573d77ae222f7b2a3be6669d..38a139c279701ae7892f41b63c3c717a3e736691 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -33,8 +33,9 @@ class TLastMarker(HLSCustomOp): """Node that adds/removes AXI stream TLAST signals where needed. Its behavior is transparent in node-by-node execution, only visible in IP-stitched rtlsim or actual hardware. - This node may be needed at the end of the network to signal a DMA write (needed by the - FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read.""" + This node may be needed at the end of the network to signal a DMA write + (needed by the FINN PYNQ shell) or at the beginning to remove the end-of-burst + from DMA read.""" def __init__(self, onnx_node): super().__init__(onnx_node) @@ -239,3 +240,15 @@ class TLastMarker(HLSCustomOp): self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<OutDType> out ("out");' ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + if self.get_nodeattr("Direction") == "in": + intf_names["s_axis"] = ["in0"] + intf_names["m_axis"] = ["out_V_V"] + else: + intf_names["s_axis"] = ["in0_V_V"] + intf_names["m_axis"] = ["out_r"] + if self.get_nodeattr("DynIters") == 1: + intf_names["axilite"] = ["s_axi_control"] + return intf_names diff --git a/src/finn/custom_op/quantavgpool2d.py b/src/finn/custom_op/quantavgpool2d.py index fb5c78bc0c8419ba519c5c3113d9b0c7ae2dd3b7..28d01069264d883f3afc400808470f5f303be799 100644 --- 
a/src/finn/custom_op/quantavgpool2d.py +++ b/src/finn/custom_op/quantavgpool2d.py @@ -75,6 +75,19 @@ class QuantAvgPool2d(CustomOp): raise Exception("Unsupported output datatype for QuantAvgPool2d") model.set_tensor_datatype(node.output[0], dtype) + def get_accum_size(self): + ibits = self.get_nodeattr("ibits") + k = self.get_nodeattr("kernel") + max_value = 2 ** ibits - 1 + max_value = max_value * k * k + max_bit_width = int(max_value).bit_length() + return max_bit_width + + def get_shifts(self): + shift_bits = self.get_accum_size() - self.get_nodeattr("obits") + shift_bits = shift_bits if shift_bits >= 0 else 0 + return shift_bits + def execute_node(self, context, graph): # create a standard average pooling node to help calculate the result node = self.onnx_node @@ -107,12 +120,7 @@ class QuantAvgPool2d(CustomOp): result_temp = sess.run(None, idict) # remove scaling introduced by average result_temp = result_temp[0] * (k * k) - ibits = self.get_nodeattr("ibits") - max_value = 2 ** ibits - 1 - max_value = max_value * k * k - max_bit_width = int(max_value).bit_length() - shift_bits = max_bit_width - self.get_nodeattr("obits") - result = np.right_shift(result_temp.astype(int), shift_bits) + result = np.right_shift(result_temp.astype(int), self.get_shifts()) if self.get_nodeattr("data_layout") == "NHWC": result = result.transpose(0, 2, 3, 1) context[node.output[0]] = result.astype(np.float32) diff --git a/src/finn/custom_op/streamingdataflowpartition.py b/src/finn/custom_op/streamingdataflowpartition.py index b63326d676f4ded5ec1dd62f5cc7f02d7acb82ad..bce4dde426b8838d6c86638a3641d51ab259a6db 100644 --- a/src/finn/custom_op/streamingdataflowpartition.py +++ b/src/finn/custom_op/streamingdataflowpartition.py @@ -83,7 +83,7 @@ class StreamingDataflowPartition(CustomOp): ) # verify the number of inputs - if len(self.onnx_node.input) == 1: + if len(self.onnx_node.input) >= 1: info_messages.append("The number of inputs is correct") else: 
info_messages.append("StreamingDataflowPartition needs 1 data input") diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 34a697a43426aae0f984770689552063aa35b9e8..4cdf138130f37809357b281155d260fdbd789e12 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from onnx import helper, TensorProto import numpy as np @@ -34,11 +35,11 @@ from finn.transformation import Transformation from finn.custom_op.registry import getCustomOp from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.general import SortGraph import finn.core.data_layout as DataLayout from finn.util.onnx import nchw_to_nhwc import warnings from finn.util.basic import get_by_name -import warnings class InferConvInpGen(Transformation): @@ -107,6 +108,7 @@ class InferConvInpGen(Transformation): Padding=2 * pad, NumChannels=ifm_ch, inputDataType=dt.name, + SIMD=ifm_ch, ) graph.node.insert(node_ind, padding_node) @@ -210,13 +212,16 @@ class InferPool_Batch(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type in ["MaxPool"]: + if n.op_type in ["MaxPool", "QuantAvgPool2d"]: # extract pool parameters - k = get_by_name(n.attribute, "kernel_shape").ints[-1] - stride = get_by_name(n.attribute, "strides").ints[-1] - if k <= stride: - continue + if n.op_type == "MaxPool": + k = get_by_name(n.attribute, "kernel_shape").ints[-1] + stride = get_by_name(n.attribute, "strides").ints[-1] + elif n.op_type == "QuantAvgPool2d": + inst = getCustomOp(n) + k = inst.get_nodeattr("kernel") + stride = inst.get_nodeattr("stride") try: pad = 
get_by_name(n.attribute, "pads").ints[-1] @@ -226,10 +231,21 @@ class InferPool_Batch(Transformation): node_input = n.input[0] node_output = n.output[0] idt = model.get_tensor_datatype(node_input) + if not idt.is_integer(): continue - # odt = model.get_tensor_datatype(node_output) + if k < stride: + continue + elif k == stride: + warnings.warn( + """Inferring Pool_Batch node for k == stride. + This case can be optimized. + For example, for MaxPool run InferStreamingMaxPool before + InferPool_Batch """ + ) + + odt = model.get_tensor_datatype(node_output) ifm_ch = model.get_tensor_shape(n.input[0])[1] # assume NCHW ofm_ch = ifm_ch @@ -269,9 +285,22 @@ class InferPool_Batch(Transformation): "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] ) + accum_bits = 0 + pool_size_param = k + pad_value = 0 if n.op_type == "MaxPool": pool_fxn = "MaxPool" + odt = idt pad_value = idt.min() + elif n.op_type == "QuantAvgPool2d": + assert odt.is_integer(), """Output data type for QuantAvgPool2d + needs to be integer""" + assert pad == 0, "Padding is not supported for QuantAvgPool2d" + inst = getCustomOp(n) + pool_fxn = "QuantAvgPool" + pool_size_param = inst.get_shifts() + accum_bits = inst.get_accum_size() + else: raise Exception( "pad_value and pool_fxn not configured for {}".format(n.op_type) @@ -301,12 +330,15 @@ class InferPool_Batch(Transformation): [pool_output], domain="finn", backend="fpgadataflow", - dataType=idt.name, + InputDataType=idt.name, + OutputDataType=odt.name, Channels=ifm_ch, PE=ifm_ch, KernelSize=k, Function=pool_fxn, OutImgDim=ofm_dim, + AccumBits=accum_bits, + Size=pool_size_param, BatchSize=1, ) @@ -616,10 +648,21 @@ class InferThresholdingLayer(Transformation): if not idt.is_integer(): continue - # skip conversion if input is not NHWC or NC + # check layout of inputs/outputs, and convert if needed + # check layout and convert if necessary thl_in_layout = model.get_tensor_layout(thl_input) - if thl_in_layout != DataLayout.NHWC and thl_in_layout != 
DataLayout.NC: - continue + if thl_in_layout == DataLayout.NCHW: + thl_input = nchw_to_nhwc(thl_input, model, node_ind) + node_ind += 1 + thl_in_shape = model.get_tensor_shape(thl_input) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + thl_output_layout = model.get_tensor_layout(thl_output) + if thl_output_layout == DataLayout.NCHW: + thl_output = nchw_to_nhwc(thl_output, model, node_ind, reverse=True) + node_ind += 1 # now safe to assume number of channels is in last dimension ifc = int(thl_in_shape[-1]) @@ -641,7 +684,7 @@ class InferThresholdingLayer(Transformation): outputDataType=odt.name, numInputVectors=list(thl_in_shape[:-1]), ) - graph.node.insert(node_ind, new_node) + graph.node.insert(insert_point, new_node) # remove old node graph.node.remove(node) graph_modified = True @@ -652,6 +695,166 @@ class InferThresholdingLayer(Transformation): return (model, graph_modified) +class InferAddStreamsLayer(Transformation): + """Convert any Add into a AddStreams HLS layer.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Add": + in0 = node.input[0] + in1 = node.input[1] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) + + # skip if different data types on inputs + if idt0 != idt1: + continue + + idt = idt0 + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + in1_layout = model.get_tensor_layout(in1) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind 
+= 1 + in0_shape = model.get_tensor_shape(in0) + + if in1_layout == DataLayout.NCHW: + in1 = nchw_to_nhwc(in1, model, node_ind) + node_ind += 1 + in1_shape = model.get_tensor_shape(in1) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume num_channels is size of last dimension + num_channels = int(in0_shape[-1]) + # create node with no parallelization first + pe = 1 + assert ( + num_channels % pe == 0 + ), "Requirement Channels divisable by PE is violated." + + # create and insert new StreamingFCLayer node + new_node = helper.make_node( + "AddStreams_Batch", + [in0, in1], + [result], + domain="finn", + backend="fpgadataflow", + NumChannels=num_channels, + PE=pe, + inputDataType=idt.name, + numInputVectors=in0_shape[:-1], + ) + graph.node.insert(insert_point, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferDuplicateStreamsLayer(Transformation): + """Insert a DuplicateStreams HLS layer for any tensor with fanout == 2 """ + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + successors = model.find_consumers(node.output[0]) + if successors is not None and len(successors) == 2: + output_tensor = node.output[0] + + dt = model.get_tensor_datatype(output_tensor) + + # skip conversion for layers with float input + if not dt.is_integer(): + continue + + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(2): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + 
model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] + + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] + + # create node with no parallelization first + pe = 1 + assert ( + num_ch % pe == 0 + ), "Requirement channels divisable by PE is violated." + + dup_node = helper.make_node( + "DuplicateStreams_Batch", + [output_tensor], + out_tensor_clones, + domain="finn", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + ) + + graph.node.insert(node_ind, dup_node) + + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break + + graph_modified = True + + if graph_modified: + model = model.transform(SortGraph()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferChannelwiseLinearLayer(Transformation): """Convert any channel-wise Add/Mul into a HLS layer.""" @@ -807,6 +1010,64 @@ class InferChannelwiseLinearLayer(Transformation): return (model, graph_modified) +class InferLabelSelectLayer(Transformation): + """Convert any TopK into a LabelSelect HLS layer.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "TopK": + fc_input = node.input[0] + k_input = node.input[1] + val_output = node.output[0] + idx_output = node.output[1] + fc_in_shape = model.get_tensor_shape(fc_input) + + idt = model.get_tensor_datatype(fc_input) + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # skip conversion for if value output 
is connected (not supported) + if model.find_consumer(val_output) is not None: + continue + + num_labels = int(fc_in_shape[-1]) + # create node with no parallelization first + pe = 1 + assert ( + num_labels % pe == 0 + ), "Requirement Labels divisable by PE is violated." + + k = model.get_initializer(k_input)[0] + + # create and insert new StreamingFCLayer node + new_node = helper.make_node( + "LabelSelect_Batch", + [fc_input], + [idx_output], + domain="finn", + backend="fpgadataflow", + Labels=num_labels, + PE=pe, + K=k, + inputDataType=idt.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferGlobalAccPoolLayer(Transformation): """Convert any GlobalAveragePool into a GlobalAccPool HLS layer and a scalar Mul.""" diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py index e0f990600d9ca4be748b662b47ce8296d3d462ce..7197e68be2fbdf5fc39b7ed202e88672614514ec 100644 --- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py +++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py @@ -45,58 +45,89 @@ class CreateDataflowPartition(Transformation): super().__init__() def apply(self, model): - # TODO we currently assume that all dataflow nodes are connected to - # each other, forming a single partition. check the assumption and/or - # improve this. 
- all_nodes = list(model.graph.node) - df_nodes = filter( - lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes - ) - df_nodes = filter( - lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8") - == "fpgadataflow", - df_nodes, - ) - df_nodes = list(df_nodes) - non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes) - non_df_nodes = list(non_df_nodes) - - if len(df_nodes) == 0: - # no changes if no dataflow nodes are present - return (model, False) - else: - # partition the model into two models - df_model = copy.deepcopy(model) - non_df_model = model - # remove all non-dataflow nodes from the dataflow model - for node_to_remove in non_df_nodes: - df_model.graph.node.remove(node_to_remove) - # identify the entry and exit points for the dataflow part - df_in = df_model.graph.node[0].input[0] - df_out = df_model.graph.node[-1].output[0] - df_in_vi = df_model.get_tensor_valueinfo(df_in) - df_out_vi = df_model.get_tensor_valueinfo(df_out) - # set df graph in/out to be df_in/df_out - df_model.graph.input.remove(df_model.graph.input[0]) - df_model.graph.input.insert(0, df_in_vi) - df_model.graph.output.remove(df_model.graph.output[0]) - df_model.graph.output.insert(0, df_out_vi) - df_model_dir = make_build_dir("dataflow_partition_") - df_model_filename = df_model_dir + "/df_model.onnx" - df_model.save(df_model_filename) - # remove all dataflow nodes from the non-dataflow model - # keep track of where the dataflow part starts - df_start_ind = all_nodes.index(df_nodes[0]) - for node_to_remove in df_nodes: - non_df_model.graph.node.remove(node_to_remove) - # create StreamingDataflow node with df_in/df_out io - df_node = helper.make_node( - "StreamingDataflowPartition", - [df_in], - [df_out], - # use the model attribute to mark the df model - model=df_model_filename, + target_partition_id = 0 + # we currently assume that all dataflow nodes belonging to the same partition + # are connected to each other and there is a single input/output to/from 
each. + # NOTE: all dataflow nodes with no partition_id set are moved to partition 0 + # TODO: check the assumption and/or improve this. + while True: + all_nodes = list(model.graph.node) + df_nodes = filter( + lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes + ) + df_nodes = filter( + lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8") + == "fpgadataflow" + and ( + get_by_name(x.attribute, "partition_id") is None + or get_by_name(x.attribute, "partition_id").i == target_partition_id + ) + and x.op_type != "StreamingDataflowPartition", + df_nodes, ) - non_df_model.graph.node.insert(df_start_ind, df_node) + df_nodes = list(df_nodes) + non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes) + non_df_nodes = list(non_df_nodes) + + if len(df_nodes) == 0: + # no changes if no dataflow nodes are present + break + else: + # partition the model into two models + df_model = copy.deepcopy(model) + non_df_model = model + # remove all non-dataflow nodes from the dataflow model + for node_to_remove in non_df_nodes: + df_model.graph.node.remove(node_to_remove) + # identify the entry and exit points for the dataflow part + df_in = df_model.graph.node[0].input[0] + df_out = df_model.graph.node[-1].output[0] + df_in_vi = df_model.get_tensor_valueinfo(df_in) + df_out_vi = df_model.get_tensor_valueinfo(df_out) + # set df graph in/out to be df_in/df_out + df_model.graph.input.remove(df_model.graph.input[0]) + df_model.graph.input.insert(0, df_in_vi) + df_model.graph.output.remove(df_model.graph.output[0]) + df_model.graph.output.insert(0, df_out_vi) + # parse StreamingFCLayers looking for external weight memories + fc_extw_nodes = filter( + lambda x: x.op_type == "StreamingFCLayer_Batch" + and get_by_name(x.attribute, "mem_mode") is not None + and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") + == "external", + df_nodes, + ) + fc_extw_nodes = list(fc_extw_nodes) + extra_df_inputs = [] + + for i in range(len(fc_extw_nodes)): + fc_weight_vi 
= df_model.get_tensor_valueinfo( + fc_extw_nodes[i].input[1] + ) + df_model.graph.input.insert(i + 1, fc_weight_vi) + extra_df_inputs.append(fc_extw_nodes[i].input[1]) + + # save model + df_model_dir = make_build_dir( + "dataflow_partition" + str(target_partition_id) + "_" + ) + df_model_filename = df_model_dir + "/df_model.onnx" + df_model.save(df_model_filename) + # remove all dataflow nodes from the non-dataflow model + # keep track of where the dataflow part starts + df_start_ind = all_nodes.index(df_nodes[0]) + for node_to_remove in df_nodes: + non_df_model.graph.node.remove(node_to_remove) + # create StreamingDataflow node with df_in/df_out io + df_node = helper.make_node( + "StreamingDataflowPartition", + [df_in] + extra_df_inputs, + [df_out], + # use the model attribute to mark the df model + model=df_model_filename, + ) + non_df_model.graph.node.insert(df_start_ind, df_node) + model = non_df_model + target_partition_id += 1 - return (non_df_model, False) + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 0e898f63db785f80cfce2683df0c9b6268e3ec7e..018ad385f33a8e0aea4aa42599fd47fe5dae57dd 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -33,6 +33,8 @@ import subprocess from finn.transformation import Transformation from finn.util.basic import get_by_name, make_build_dir from finn.custom_op.registry import getCustomOp +from finn.util.basic import get_num_default_workers +import multiprocessing as mp class CreateStitchedIP(Transformation): @@ -49,20 +51,137 @@ class CreateStitchedIP(Transformation): The packaged block design IP can be found under the ip subdirectory. 
""" - def __init__(self, fpgapart, clk_ns = 10.0): + def __init__(self, fpgapart, clk_ns=10.0, ip_name="finn_design", vitis=False): super().__init__() self.fpgapart = fpgapart self.clk_ns = clk_ns + self.ip_name = ip_name + self.vitis = vitis if float(clk_ns) not in [5.0, 10.0, 20.0]: warnings.warn( """The chosen frequency may lead to failure due to clock divider constraints.""" ) + self.has_axilite = False + self.has_aximm = False + self.has_m_axis = False + self.m_axis_idx = 0 + self.has_s_axis = False + self.s_axis_idx = 0 + self.clock_reset_are_external = False + self.create_cmds = [] + self.connect_cmds = [] + # keep track of top-level interface names + self.intf_names = { + "clk": [], + "rst": [], + "s_axis": [], + "m_axis": [], + "aximm": [], + "axilite": [], + } + + def connect_clk_rst(self, node): + inst_name = node.name + node_inst = getCustomOp(node) + clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0] + reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0] + # make clock and reset external, if they aren't already + if not self.clock_reset_are_external: + self.connect_cmds.append( + "make_bd_pins_external [get_bd_pins %s/%s]" + % (inst_name, clock_intf_name) + ) + self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]") + self.connect_cmds.append( + "make_bd_pins_external [get_bd_pins %s/%s]" + % (inst_name, reset_intf_name) + ) + self.connect_cmds.append( + "set_property name ap_rst_n [get_bd_ports ap_rst_n_0]" + ) + self.clock_reset_are_external = True + self.intf_names["clk"] = ["ap_clk"] + self.intf_names["rst"] = ["ap_rst_n"] + # otherwise connect clock and reset + else: + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]" + % (inst_name, reset_intf_name) + ) + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" + % (inst_name, clock_intf_name) + ) + + def connect_axi(self, node): + inst_name = node.name + node_inst 
= getCustomOp(node) + axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"] + aximm_intf_name = node_inst.get_verilog_top_module_intf_names()["aximm"] + if len(axilite_intf_name) != 0: + self.connect_cmds.append( + "make_bd_intf_pins_external " + "[get_bd_intf_pins %s/%s]" % (inst_name, axilite_intf_name[0]) + ) + self.connect_cmds.append( + "set_property name s_axi_control " "[get_bd_intf_ports s_axi_control_0]" + ) + assert ( + self.has_axilite is False + ), "Currently limited to one slave AXI-Stream" + self.intf_names["axilite"] = ["s_axi_control"] + self.has_axilite = True + if len(aximm_intf_name) != 0: + self.connect_cmds.append( + "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" + % (inst_name, aximm_intf_name[0]) + ) + self.connect_cmds.append( + "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]" + ) + self.intf_names["aximm"] = ["m_axi_gmem0"] + assert self.has_aximm is False, "Currently limited to one AXI-MM interface" + self.has_aximm = True + + def connect_m_axis_external(self, node): + inst_name = node.name + node_inst = getCustomOp(node) + output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"] + # make output axis external + for output_intf_name in output_intf_names: + self.connect_cmds.append( + "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" + % (inst_name, output_intf_name) + ) + self.connect_cmds.append( + "set_property name m_axis_%d [get_bd_intf_ports %s_0]" + % (self.m_axis_idx, output_intf_name) + ) + self.has_m_axis = True + self.intf_names["m_axis"].append("m_axis_%d" % self.m_axis_idx) + self.m_axis_idx += 1 + + def connect_s_axis_external(self, node): + inst_name = node.name + node_inst = getCustomOp(node) + input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"] + # make input axis external + for input_intf_name in input_intf_names: + self.connect_cmds.append( + "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" + % (inst_name, input_intf_name) + ) 
+ self.connect_cmds.append( + "set_property name s_axis_%d [get_bd_intf_ports %s_0]" + % (self.s_axis_idx, input_intf_name) + ) + self.has_s_axis = True + self.intf_names["s_axis"].append("s_axis_%d" % self.s_axis_idx) + self.s_axis_idx += 1 def apply(self, model): ip_dirs = ["list"] - create_cmds = [] - connect_cmds = [] # ensure that all nodes are fpgadataflow, and that IPs are generated for node in model.graph.node: assert node.domain == "finn", 'Node domain is not set to "finn"' @@ -80,59 +199,62 @@ class CreateStitchedIP(Transformation): vlnv = node_inst.get_nodeattr("ip_vlnv") inst_name = node.name create_cmd = "create_bd_cell -type ip -vlnv %s %s" % (vlnv, inst_name) - create_cmds += [create_cmd] - # TODO nonlinear topologies: check this for all inputs + self.create_cmds += [create_cmd] my_producer = model.find_producer(node.input[0]) + self.connect_clk_rst(node) + self.connect_axi(node) if my_producer is None: # first node in graph - # make clock and reset external - connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/ap_clk]" % inst_name - ) - connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/ap_rst_n]" % inst_name - ) - # make input external - connect_cmds.append( - "make_bd_intf_pins_external [get_bd_intf_pins %s/in0_V_V]" - % inst_name - ) + self.connect_s_axis_external(node) + if node.op_type == "TLastMarker": + assert ( + node_inst.get_nodeattr("Direction") == "in" + ), """Output TLastMarker incorrect direction""" + elif node.op_type == "IODMA": + assert ( + node_inst.get_nodeattr("direction") == "in" + ), """Input DMA incorrect direction""" else: # intermediate node - # wire up global clock and reset - connect_cmds.append( - "connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins %s/ap_rst_n]" - % inst_name - ) - connect_cmds.append( - "connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins %s/ap_clk]" - % inst_name - ) - # wire up input to previous output - # TODO nonlinear topologies: loop over all inputs - my_in_name = 
"%s/in0_V_V" % (inst_name) - prev_out_name = "%s/out_V_V" % (my_producer.name) - connect_cmds.append( - "connect_bd_intf_net [get_bd_intf_pins %s] [get_bd_intf_pins %s]" - % (prev_out_name, my_in_name) - ) - if model.find_consumer(node.output[0]) is None: + # wire up input(s) to previous node output(s) + # foreach input + # find producer + # find index of producer output connected to our target input + # get names of hdl interfaces for input and producer output + # issue a TCL directive to connect input to output + for i in range(len(node.input)): + producer = model.find_producer(node.input[i]) + if producer is None: + continue + j = list(producer.output).index(node.input[i]) + src_intf_name = getCustomOp( + producer + ).get_verilog_top_module_intf_names()["m_axis"][j] + dst_intf_name = node_inst.get_verilog_top_module_intf_names()[ + "s_axis" + ][i] + self.connect_cmds.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s]" + % (producer.name, src_intf_name, node.name, dst_intf_name) + ) + if model.find_consumers(node.output[0]) is None: # last node in graph + self.connect_m_axis_external(node) # ensure it is a TLastMarker to have a valid TLast signal assert ( - node.op_type == "TLastMarker" - ), """Last node is not TLastMarker. - Please run transformation InsertTLastMarker to ensure a valid - TLast signal""" - # make output external - connect_cmds.append( - "make_bd_intf_pins_external [get_bd_intf_pins %s/out_r]" % inst_name - ) - # make AXI lite IF external - connect_cmds.append( - "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi_control]" - % inst_name - ) + node.op_type == "TLastMarker" or node.op_type == "IODMA" + ), """Last node is not TLastMarker or DMA. 
+ Please run transformation InsertTLastMarker/InsertIODMA to ensure + a valid TLast signal""" + if node.op_type == "TLastMarker": + assert ( + node_inst.get_nodeattr("Direction") == "out" + ), """Output TLastMarker incorrect direction""" + elif node.op_type == "IODMA": + assert ( + node_inst.get_nodeattr("direction") == "out" + ), """Output DMA incorrect direction""" # create a temporary folder for the project prjname = "finn_vivado_stitch_proj" @@ -150,22 +272,54 @@ class CreateStitchedIP(Transformation): tcl.append("set_property ip_repo_paths [%s] [current_project]" % ip_dirs_str) tcl.append("update_ip_catalog") # create block design and instantiate all layers - block_name = "finn_design" + block_name = self.ip_name tcl.append('create_bd_design "%s"' % block_name) - tcl.extend(create_cmds) - tcl.extend(connect_cmds) + tcl.extend(self.create_cmds) + tcl.extend(self.connect_cmds) fclk_mhz = 1 / (self.clk_ns * 0.001) fclk_hz = fclk_mhz * 1000000 model.set_metadata_prop("clk_ns", str(self.clk_ns)) - tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk_0]" % fclk_hz) + tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk]" % fclk_hz) tcl.append("regenerate_bd_layout") tcl.append("validate_bd_design") tcl.append("save_bd_design") + # create wrapper hdl (for rtlsim later on) + bd_base = "%s/%s.srcs/sources_1/bd/%s" % ( + vivado_stitch_proj_dir, + prjname, + block_name, + ) + bd_filename = "%s/%s.bd" % (bd_base, block_name) + tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename) + wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name) + tcl.append("add_files -norecurse %s" % wrapper_filename) + model.set_metadata_prop("wrapper_filename", wrapper_filename) + # synthesize to DCP and export stub, DCP and constraints + if self.vitis: + tcl.append( + "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]" + % bd_filename + ) + tcl.append( + "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} " + "-value {-mode 
out_of_context} -objects [get_runs synth_1]" + ) + num_workers = get_num_default_workers() + assert num_workers >= 0, "Number of workers must be nonnegative." + if num_workers == 0: + num_workers = mp.cpu_count() + tcl.append("launch_runs synth_1 -jobs %s" % str(num_workers)) + tcl.append("wait_on_run [get_runs synth_1]") + tcl.append("open_run synth_1 -name synth_1") + tcl.append("write_verilog -force -mode synth_stub %s.v" % block_name) + tcl.append("write_checkpoint %s.dcp" % block_name) + tcl.append("write_xdc %s.xdc" % block_name) # export block design itself as an IP core block_vendor = "xilinx_finn" block_library = "finn" block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name) model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv) + model.set_metadata_prop("vivado_stitch_ifnames", str(self.intf_names)) tcl.append( ( "ipx::package_project -root_dir %s/ip -vendor %s " @@ -175,19 +329,89 @@ class CreateStitchedIP(Transformation): ) tcl.append("set_property core_revision 2 [ipx::find_open_core %s]" % block_vlnv) tcl.append("ipx::create_xgui_files [ipx::find_open_core %s]" % block_vlnv) + # if targeting Vitis, add some properties to the IP + if self.vitis: + tcl.append( + "ipx::remove_bus_parameter FREQ_HZ " + "[ipx::get_bus_interfaces CLK.AP_CLK -of_objects [ipx::current_core]]" + ) + # replace source code with dcp + tcl.append( + "set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv + ) + tcl.append( + "set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv + ) + tcl.append( + "set_property supported_families { } [ipx::find_open_core %s]" + % block_vlnv + ) + tcl.append( + "set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} " + "[ipx::find_open_core %s]" % block_vlnv + ) + tcl.append( + "set_property auto_family_support_level level_2 " + "[ipx::find_open_core %s]" % block_vlnv + ) + # remove all files from synthesis and sim groups + # we'll replace with DCP, stub, and xdc + tcl.append( + 
"ipx::remove_all_file " + "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]" + ) + tcl.append( + "ipx::remove_all_file " + "[ipx::get_file_groups xilinx_anylanguagesynthesis]" + ) + tcl.append( + "ipx::remove_file_group " + "xilinx_anylanguagebehavioralsimulation [ipx::current_core]" + ) + tcl.append( + "ipx::remove_file_group " + "xilinx_anylanguagesynthesis [ipx::current_core]" + ) + # remove sim and src folders + tcl.append("file delete -force %s/ip/sim" % vivado_stitch_proj_dir) + tcl.append("file delete -force %s/ip/src" % vivado_stitch_proj_dir) + # copy and add DCP, stub, and xdc + tcl.append("file mkdir %s/ip/dcp" % vivado_stitch_proj_dir) + tcl.append("file mkdir %s/ip/impl" % vivado_stitch_proj_dir) + tcl.append( + "file copy -force %s.dcp %s/ip/dcp" + % (block_name, vivado_stitch_proj_dir) + ) + tcl.append( + "file copy -force %s.xdc %s/ip/impl" + % (block_name, vivado_stitch_proj_dir) + ) + tcl.append("ipx::add_file_group xilinx_implementation [ipx::current_core]") + tcl.append( + "ipx::add_file impl/%s.xdc [ipx::get_file_groups xilinx_implementation]" + % block_name + ) + tcl.append( + "set_property used_in [list implementation] " + "[ipx::get_files impl/%s.xdc " + "-of_objects [ipx::get_file_groups xilinx_implementation]]" % block_name + ) + tcl.append( + "ipx::add_file_group " "xilinx_synthesischeckpoint [ipx::current_core]" + ) + tcl.append( + "ipx::add_file dcp/%s.dcp " + "[ipx::get_file_groups xilinx_synthesischeckpoint]" % block_name + ) + tcl.append( + "ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]" + ) + tcl.append( + "ipx::add_file dcp/%s.dcp " + "[ipx::get_file_groups xilinx_simulationcheckpoint]" % block_name + ) tcl.append("ipx::update_checksums [ipx::find_open_core %s]" % block_vlnv) tcl.append("ipx::save_core [ipx::find_open_core %s]" % block_vlnv) - # create wrapper hdl (for rtlsim later on) - bd_base = "%s/%s.srcs/sources_1/bd/%s" % ( - vivado_stitch_proj_dir, - prjname, - block_name, - ) - bd_filename 
= "%s/%s.bd" % (bd_base, block_name) - tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename) - wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name) - tcl.append("add_files -norecurse %s" % wrapper_filename) - model.set_metadata_prop("wrapper_filename", wrapper_filename) # export list of used Verilog files (for rtlsim later on) tcl.append("set all_v_files [get_files -filter {FILE_TYPE == Verilog}]") v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index b01f8cbe5c48db6c5288b2db1a8b009ea09ce6c0..85a2d47be0599a852b223f1a65d3ec04efe9bda7 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -118,8 +118,11 @@ class InsertFIFO(Transformation): graph_modified = True if graph_modified is False: - # insert FIFO as first node - if graph.node[0].op_type != "StreamingFIFO": + # insert FIFO as first node, except when first node is DMA + if ( + graph.node[0].op_type != "StreamingFIFO" + and graph.node[0].op_type != "IODMA" + ): n = graph.node[0] n_input = n.input[0] n0 = getCustomOp(n) @@ -153,8 +156,11 @@ class InsertFIFO(Transformation): # set fifo output tensor as new input tensor of second node n.input[0] = fifo_output_tensor.name - # insert FIFO as last node - if graph.node[-1].op_type != "StreamingFIFO": + # insert FIFO as last node, except when last node is DMA + if ( + graph.node[-1].op_type != "StreamingFIFO" + and graph.node[0].op_type != "IODMA" + ): n = graph.node[-1] assert ( n.op_type != "TLastMarker" diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index e4368edea717f7499481e9b1c6ac20f7d5bb5f58..0cd7c0d4d41accf8cdba8adfaf4dbb00fc0cab7a 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ 
-171,6 +171,7 @@ class InsertIODMA(Transformation): # calculate width of stream output from DMA pe = get_by_name(fc_node.attribute, "PE").i simd = get_by_name(fc_node.attribute, "SIMD").i + assert pe * simd == w_shape[0], "Malformed weight matrix" streamWidth = simd * pe * w_dtype.bitwidth() # make new buffer fc_node_in = oh.make_tensor_value_info( @@ -178,12 +179,13 @@ class InsertIODMA(Transformation): ) model.graph.value_info.append(fc_node_in) model.set_tensor_datatype(fc_node_in.name, w_dtype) + model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name)) dma_node = oh.make_node( "IODMA", [fc_w_name], [fc_node_in.name], - numInputVectors=w_shape[:-1], - NumChannels=w_shape[-1], + numInputVectors=[w_shape[1]], + NumChannels=w_shape[0], dataType=str(w_dtype.name), intfWidth=intfwidth, streamWidth=streamWidth, diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 04dd437af27b9fbe18b2255c20a8e4acda03b3d0..bbb0e43fda464e919a7d8c9dcd25e08a49b33cec 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -38,7 +38,8 @@ import numpy as np class InsertTLastMarker(Transformation): """Ensure that the graph is started/terminated with a TLastMarker node, inserting - one if necessary. Use constructor args to determine type of TLastMarker to be inserted. + one if necessary. + Use constructor args to determine type of TLastMarker to be inserted. More information available on the TLastMarker documentation. 
""" @@ -90,41 +91,78 @@ class InsertTLastMarker(Transformation): graph_modified = True # if both is True, also insert marker on input if self.both: - graph_in_name = model.graph.input[0].name - first_node = model.find_consumer(graph_in_name) - if first_node.op_type != "TLastMarker" and not ( - first_node.op_type == "IODMA" - and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") - == "in" - ): + # detect and parse graph inputs + insert_idx = 0 + graph_in_names = [x.name for x in model.graph.input] + for graph_in_name in graph_in_names: + first_node = model.find_consumers(graph_in_name) + # skip if no consumers (this may be the case for unused initializers) + # TODO: fix this with a cleanup transform + if first_node is None: + continue + assert len(first_node) == 1, "Input fans out to multiple nodes" + first_node = first_node[0] + # several scenarios exclude the node: + # 1. node is a FC layer with internal weights, in which case + # the input is in the list of graph inputs because it has an + # initializer (TODO: fix this with a clean-up transform) + if ( + first_node.op_type == "StreamingFCLayer_Batch" + and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8") + != "external" + ): + continue + # 2. 
node is either a TLastMarker or an input IODMA + if first_node.op_type != "TLastMarker" and not ( + first_node.op_type == "IODMA" + and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") + == "in" + ): - custom_op = getCustomOp(first_node) - num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) - stream_width = int(custom_op.get_instream_width()) - in_shape = model.get_tensor_shape(graph_in_name) - in_dtype = model.get_tensor_datatype(graph_in_name) - elem_width = in_dtype.bitwidth() - # make new buffer - first_node_in = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape - ) - model.graph.value_info.append(first_node_in) - model.set_tensor_datatype(first_node_in.name, in_dtype) - # reroute final node output to first_node_in_name - first_node.input[0] = first_node_in.name - tlast_node = oh.make_node( - "TLastMarker", - [graph_in_name], - [first_node_in.name], - NumIters=num_iters, - StreamWidth=stream_width, - ElemWidth=elem_width, - DynIters=(1 if self.dyniters else 0), - Direction="in", - Protocol=("external" if self.external else "internal"), - domain="finn", - backend="fpgadataflow", - ) - model.graph.node.insert(0, tlast_node) - graph_modified = True + custom_op = getCustomOp(first_node) + num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) + inp_idx = list(first_node.input).index(graph_in_name) + if inp_idx > 0: + if ( + first_node.op_type == "StreamingFCLayer_Batch" + and inp_idx == 1 + ): + stream_width = int(custom_op.get_weightstream_width()) + elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1: + stream_width = int(custom_op.get_instream_width()) + else: + raise Exception("No method to determine stream width") + else: + stream_width = int(custom_op.get_instream_width()) + in_shape = model.get_tensor_shape(graph_in_name) + in_dtype = model.get_tensor_datatype(graph_in_name) + elem_width = in_dtype.bitwidth() + # make new buffer + first_node_in = oh.make_tensor_value_info( + 
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape + ) + model.graph.value_info.append(first_node_in) + model.set_tensor_datatype(first_node_in.name, in_dtype) + ini = model.get_initializer(graph_in_name) + # copy initializer if it exists + if ini is not None: + model.set_initializer(first_node_in.name, ini) + # reroute final node output to first_node_in_name + first_node.input[inp_idx] = first_node_in.name + tlast_node = oh.make_node( + "TLastMarker", + [graph_in_name], + [first_node_in.name], + NumIters=num_iters, + StreamWidth=stream_width, + ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="in", + Protocol=("external" if self.external else "internal"), + domain="finn", + backend="fpgadataflow", + ) + model.graph.node.insert(insert_idx, tlast_node) + graph_modified = True + insert_idx += 1 return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 18d3db18da089a5dda4dbb6d97180dd4a20613b5..1e45a65720604144f67245b98dcbe3f6dc8363f5 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -28,6 +28,7 @@ import os import shutil +import warnings from finn.custom_op.registry import getCustomOp from finn.transformation import Transformation @@ -53,7 +54,7 @@ class MakePYNQDriver(Transformation): def apply(self, model): vivado_pynq_proj = model.get_metadata_prop("vivado_pynq_proj") if vivado_pynq_proj is None or (not os.path.isdir(vivado_pynq_proj)): - raise Exception("No PYNQ project found, apply MakePYNQProject first.") + warnings.warn("No PYNQ project found, apply MakePYNQProject first.") # create a temporary folder for the generated driver pynq_driver_dir = make_build_dir(prefix="pynq_driver_") @@ -108,7 +109,12 @@ class MakePYNQDriver(Transformation): driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed)) # clock settings for 
driver - clk_ns = float(model.get_metadata_prop("clk_ns")) + clk_ns = model.get_metadata_prop("clk_ns") + # default to 10ns / 100 MHz if property not set + if clk_ns is None: + clk_ns = 10.0 + else: + clk_ns = float(clk_ns) fclk_mhz = 1 / (clk_ns * 0.001) # TODO change according to PYNQ board? driver = driver.replace("$CLK_NAME$", "fclk0_mhz") diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py index 91f6bd2c4ab19c736fcf21322979cac17a163f24..a874d7a7c702e1b3e9125fc031aa65dc287a407d 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py @@ -67,6 +67,16 @@ class MakePYNQProject(Transformation): raise Exception( "No vlnv for stitched IP found, apply CreateStitchedIP first." ) + vivado_stitch_ifnames = model.get_metadata_prop("vivado_stitch_ifnames") + if vivado_stitch_ifnames is None: + raise Exception("No IF name metadata found, apply CreateStitchedIP first.") + vivado_stitch_ifnames = eval(vivado_stitch_ifnames) + # recover interface names from dict + self.clk_name = vivado_stitch_ifnames["clk"][0] + self.rst_name = vivado_stitch_ifnames["rst"][0] + self.s_axis_if_name = vivado_stitch_ifnames["s_axis"][0] + self.m_axis_if_name = vivado_stitch_ifnames["m_axis"][0] + self.s_aximm_if_name = vivado_stitch_ifnames["axilite"][0] # collect list of all IP dirs ip_dirs = ["list"] @@ -105,11 +115,11 @@ class MakePYNQProject(Transformation): multiple of 8.""" in_bytes = i_bits_per_cycle_padded / 8 out_bytes = o_bits_per_cycle_padded / 8 - in_if_name = "in0_V_V_0" - out_if_name = "out_r_0" - clk_name = "ap_clk_0" - nrst_name = "ap_rst_n_0" - axi_lite_if_name = "s_axi_control_0" + in_if_name = self.s_axis_if_name + out_if_name = self.m_axis_if_name + clk_name = self.clk_name + nrst_name = self.rst_name + axi_lite_if_name = self.s_aximm_if_name vivado_ip_cache = os.getenv("VIVADO_IP_CACHE", default="") # create a temporary folder for 
the project diff --git a/src/finn/transformation/fpgadataflow/synth_ooc.py b/src/finn/transformation/fpgadataflow/synth_ooc.py index 1d49970c819961d1794cc89e998108639ca15593..8fd7e4724ef7f255b1435d5ab5e680d155d39487 100644 --- a/src/finn/transformation/fpgadataflow/synth_ooc.py +++ b/src/finn/transformation/fpgadataflow/synth_ooc.py @@ -37,7 +37,7 @@ from finn.util.basic import make_build_dir class SynthOutOfContext(Transformation): """Run out-of-context Vivado synthesis on a stitched IP design.""" - def __init__(self, part, clk_period_ns, clk_name="ap_clk_0"): + def __init__(self, part, clk_period_ns, clk_name="ap_clk"): super().__init__() self.part = part self.clk_period_ns = clk_period_ns diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index e4da964552d15543ea93df4fbf01ddab7eb7f6f2..4b91b5c33f032ae1664163ab0ae1cacdf8b91826 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -104,9 +104,10 @@ from finn.core.datatype import DataType from pynq.ps import Clocks class FINNAccelDriver(): - def __init__(self, N, bitfile): + def __init__(self, N, bitfile, platform="zynq"): \"\"\"Instantiate the FINN accelerator driver. 
Gets batchsize (N) as integer and path to bitfile as string.\"\"\" + self.platform = platform self.N = N # input FINN DataType self.idt = $INPUT_FINN_DATATYPE$ @@ -119,21 +120,29 @@ class FINNAccelDriver(): self.oshape_folded = $OUTPUT_SHAPE_FOLDED$ self.ishape_packed = $INPUT_SHAPE_PACKED$ # datatype np.uint8 self.oshape_packed = $OUTPUT_SHAPE_PACKED$ # datatype np.uint8 - # clock frequency - self.fclk_mhz = $CLOCK_FREQ_MHZ$ # load bitfile and set up accelerator self.ol = Overlay(bitfile) - # set the clock frequency as specified by user during transformations - Clocks.$CLK_NAME$ = self.fclk_mhz - self.dma = self.ol.axi_dma_0 - self.ctrl_regs = self.ol.resize_accel_0 # neuron folding factor of output = iterations per sample self.itersPerSample = self.oshape_packed[-2] - # AXI lite register offset for number of iterations - # used by TLastMarker to signal end of transmission for AXI CDMA - self.REG_OFFSET_NUM_ITERS = 0x10 - # set up TLastMarker with correct num. samples - self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, self.N*self.itersPerSample) + if self.platform == "zynq": + # clock frequency + self.fclk_mhz = $CLOCK_FREQ_MHZ$ + # set the clock frequency as specified by user during transformations + if self.fclk_mhz > 0: + Clocks.$CLK_NAME$ = self.fclk_mhz + self.dma = self.ol.axi_dma_0 + self.ctrl_regs = self.ol.resize_accel_0 + + # AXI lite register offset for number of iterations + # used by TLastMarker to signal end of transmission for AXI CDMA + self.REG_OFFSET_NUM_ITERS = 0x10 + # set up TLastMarker with correct num. 
samples + self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, self.N*self.itersPerSample) + elif self.platform == "alveo": + self.idma = self.ol.idma0 + self.odma = self.ol.odma0 + else: + raise ValueError("Supported platforms are zynq and alveo") # allocate a PYNQ buffer for the packed input and buffer self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8) @@ -176,19 +185,29 @@ class FINNAccelDriver(): np.copyto(self.ibuf_packed_device, data) def execute(self): - \"\"\"Executes accelerator by setting up the DMA and - waiting until all transfers complete. Uses only member variables and + \"\"\"Executes accelerator by setting up the DMA(s) and + waiting until all transfers/calls complete. Uses only member variables and returns nothing.\"\"\" - dma = self.dma - dma.sendchannel.transfer(self.ibuf_packed_device) - dma.recvchannel.transfer(self.obuf_packed_device) - dma.sendchannel.wait() - dma.recvchannel.wait() + if self.platform == "zynq": + dma = self.dma + dma.sendchannel.transfer(self.ibuf_packed_device) + dma.recvchannel.transfer(self.obuf_packed_device) + dma.sendchannel.wait() + dma.recvchannel.wait() + else: + self.ibuf_packed_device.sync_to_device() + self.idma.start(self.ibuf_packed_device, self.N) + self.odma.start(self.obuf_packed_device, self.N) + self.idma.wait() + self.odma.wait() + self.obuf_packed_device.sync_from_device() + if __name__ == "__main__": parser = argparse.ArgumentParser(description='Set exec mode, batchsize N, bitfile name, inputfile name and outputfile name') parser.add_argument('--exec_mode', help='Please select functional verification ("execute") or throughput test ("throughput_test")', default="execute") + parser.add_argument('--platform', help='Target platform, zynq or alveo', default="zynq") parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1) parser.add_argument('--bitfile', help='name of bitfile (i.e. 
"resizer.bit")', default="resizer.bit") parser.add_argument('--inputfile', help='name of input npy file (i.e. "input.npy")', default="input.npy") @@ -196,13 +215,14 @@ if __name__ == "__main__": # parse arguments args = parser.parse_args() exec_mode = args.exec_mode + platform = args.platform N = args.batchsize bitfile = args.bitfile inputfile = args.inputfile outputfile = args.outputfile # instantiate FINN accelerator driver and pass batchsize and bitfile - finnDriver = FINNAccelDriver(N, bitfile) + finnDriver = FINNAccelDriver(N, bitfile, platform) # for the remote execution the data from the input npy file has to be loaded, # packed and copied to the PYNQ buffer diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py new file mode 100644 index 0000000000000000000000000000000000000000..ae529f2f4a165a732627befea0675073bc490996 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -0,0 +1,309 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import subprocess + +from finn.core.modelwrapper import ModelWrapper +from finn.transformation import Transformation +from finn.custom_op.registry import getCustomOp + +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker +from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.floorplan import Floorplan +from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from finn.util.basic import make_build_dir +from finn.transformation.infer_data_layouts import InferDataLayouts + +def _check_vitis_envvars(): + assert "VITIS_PATH" in os.environ, "VITIS_PATH must be set for Vitis" + assert "PLATFORM_REPO_PATHS" in os.environ, "PLATFORM_REPO_PATHS must be set for Vitis" + assert "XILINX_XRT" in 
os.environ, "XILINX_XRT must be set for Vitis, ensure the XRT env is sourced" + +class CreateVitisXO(Transformation): + """Create a Vitis object file from a stitched FINN ip. + + Outcome if successful: sets the vitis_xo attribute in the ONNX + ModelProto's metadata_props field with the name of the object file as value. + The object file can be found under the ip subdirectory. + """ + + def __init__(self, ip_name="finn_design"): + super().__init__() + self.ip_name = ip_name + + def apply(self, model): + _check_vitis_envvars() + vivado_proj_dir = model.get_metadata_prop("vivado_stitch_proj") + stitched_ip_dir = vivado_proj_dir + "/ip" + args_string = [] + m_axis_idx = 0 + s_axis_idx = 0 + # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface + # developed from instructions in UG1393 (v2019.2) and package_xo documentation + # package_xo is responsible for generating the kernel xml + for node in model.graph.node: + node_inst = getCustomOp(node) + arg_id = 0 + if node.op_type == "TLastMarker": + stream_width = node_inst.get_nodeattr("StreamWidth") + # add a stream input or output port, based on direction + if node_inst.get_nodeattr("Direction") == "in": + args_string.append( + "{in:4:%s:s_axis_%d:0x0:0x0:ap_uint<%s>:0}" + % (str(arg_id), s_axis_idx, str(stream_width)) + ) + s_axis_idx += 1 + else: + args_string.append( + "{out:4:%s:m_axis_%d:0x0:0x0:ap_uint<%s>:0}" + % (str(arg_id), m_axis_idx, str(stream_width)) + ) + m_axis_idx += 1 + arg_id += 1 + # add a axilite port if dynamic + # add a count parameter if dynamic + if node_inst.get_nodeattr("DynIters") == 1: + args_string.append( + "{numReps:0:%s:s_axi_control:0x4:0x10:uint:0}" % str(arg_id) + ) + arg_id += 1 + elif node.op_type == "IODMA": + port_width = node_inst.get_nodeattr("intfWidth") + # add an address parameter + # add a count parameter + args_string.append( + "{addr:1:%s:m_axi_gmem0:0x8:0x10:ap_uint<%s>*:0}" + % (str(arg_id), str(port_width)) + ) + arg_id += 1 + args_string.append( 
+ "{numReps:0:%s:s_axi_control:0x4:0x1C:uint:0}" % str(arg_id) + ) + arg_id += 1 + + # save kernel xml then run package_xo + xo_name = self.ip_name + ".xo" + xo_path = vivado_proj_dir + "/" + xo_name + model.set_metadata_prop("vitis_xo", xo_path) + + # generate the package_xo command in a tcl script + package_xo_string = ( + "package_xo -force -xo_path %s -kernel_name %s -ip_directory %s" + % (xo_path, self.ip_name, stitched_ip_dir) + ) + for arg in args_string: + package_xo_string += " -kernel_xml_args " + arg + with open(vivado_proj_dir + "/gen_xo.tcl", "w") as f: + f.write(package_xo_string) + + # create a shell script and call Vivado + package_xo_sh = vivado_proj_dir + "/gen_xo.sh" + working_dir = os.environ["PWD"] + with open(package_xo_sh, "w") as f: + f.write("#!/bin/bash \n") + f.write("cd {}\n".format(vivado_proj_dir)) + f.write("vivado -mode batch -source gen_xo.tcl\n") + f.write("cd {}\n".format(working_dir)) + bash_command = ["bash", package_xo_sh] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate() + assert os.path.isfile(xo_path), "Vitis .xo file not created, check logs under %s" % vivado_proj_dir + return (model, False) + + +class VitisLink(Transformation): + """Create an XCLBIN with Vitis. + + Outcome if successful: sets the vitis_xclbin attribute in the ONNX + ModelProto's metadata_props field with the XCLBIN full path as value. 
+ """ + + def __init__(self, platform, f_mhz=200): + super().__init__() + self.platform = platform + self.f_mhz = f_mhz + + def apply(self, model): + _check_vitis_envvars() + # create a config file and empty list of xo files + config = ["[connectivity]"] + object_files = [] + idma_idx = 0 + odma_idx = 0 + instance_names = {} + for node in model.graph.node: + assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" + sdp_node = getCustomOp(node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + kernel_xo = kernel_model.get_metadata_prop("vitis_xo") + object_files.append(kernel_xo) + # gather info on connectivity + # assume each node connected to outputs/inputs is DMA: + # has axis, aximm and axilite + # everything else is axis-only + # assume only one connection from each ip to the next + # all aximm allocated to DDR[0] + # all kernels allocated to SLR0 + producer = model.find_producer(node.input[0]) + consumer = model.find_consumers(node.output[0]) + # define kernel instances + # name kernels connected to graph inputs as idmaxx + # name kernels connected to graph inputs as odmaxx + if producer is None: + instance_names[node.name] = "idma" + str(idma_idx) + config.append("nk=%s:1:%s" % (node.name, instance_names[node.name])) + idma_idx += 1 + elif consumer is None: + instance_names[node.name] = "odma" + str(odma_idx) + config.append("nk=%s:1:%s" % (node.name, instance_names[node.name])) + odma_idx += 1 + else: + instance_names[node.name] = node.name + config.append("nk=%s:1:%s" % (node.name, instance_names[node.name])) + # assign SLRs + config.append("slr=%s:SLR0" % instance_names[node.name]) + # assign memory banks + if producer is None or consumer is None: + config.append( + "sp=%s.m_axi_gmem0:DDR[%d]" % (instance_names[node.name], 0) + ) + # connect streams + if producer is not None: + for i in range(len(node.input)): + producer = model.find_producer(node.input[i]) + if producer is 
not None: + j = list(producer.output).index(node.input[i]) + config.append( + "stream_connect=%s.m_axis_%d:%s.s_axis_%d" + % ( + instance_names[producer.name], + j, + instance_names[node.name], + i, + ) + ) + + # create a temporary folder for the project + link_dir = make_build_dir(prefix="vitis_link_proj_") + model.set_metadata_prop("vitis_link_proj", link_dir) + + config = "\n".join(config) + "\n" + with open(link_dir + "/config.txt", "w") as f: + f.write(config) + + # create a shell script and call Vitis + script = link_dir + "/run_vitis_link.sh" + working_dir = os.environ["PWD"] + with open(script, "w") as f: + f.write("#!/bin/bash \n") + f.write("cd {}\n".format(link_dir)) + f.write( + "v++ -t hw --platform %s --link %s" + " --kernel_frequency %d --config config.txt\n" + % (self.platform, " ".join(object_files), self.f_mhz) + ) + f.write("cd {}\n".format(working_dir)) + bash_command = ["bash", script] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate() + # TODO rename xclbin appropriately here? 
+ xclbin = link_dir + "/a.xclbin" + assert os.path.isfile(xclbin), "Vitis .xclbin file not created, check logs under %s" % link_dir + model.set_metadata_prop("vitis_xclbin", xclbin) + return (model, False) + + +class VitisBuild(Transformation): + """Best-effort attempt at building the accelerator with Vitis.""" + + def __init__(self, fpga_part, period_ns, platform): + super().__init__() + self.fpga_part = fpga_part + self.period_ns = period_ns + self.platform = platform + + def apply(self, model): + _check_vitis_envvars() + # first infer layouts + model = model.transform(InferDataLayouts()) + # prepare at global level, then break up into kernels + prep_transforms = [ + MakePYNQDriver(), + InsertIODMA(512), + InsertDWC(), + Floorplan(), + CreateDataflowPartition(), + ] + for trn in prep_transforms: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + # Build each kernel individually + sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition") + for sdp_node in sdp_nodes: + sdp_node = getCustomOp(sdp_node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + kernel_model = kernel_model.transform(InsertFIFO()) + kernel_model = kernel_model.transform( + InsertTLastMarker(both=True, external=False, dynamic=False) + ) + kernel_model = kernel_model.transform(GiveUniqueNodeNames()) + kernel_model.save(dataflow_model_filename) + kernel_model = kernel_model.transform( + PrepareIP(self.fpga_part, self.period_ns) + ) + kernel_model = kernel_model.transform(HLSSynthIP()) + kernel_model = kernel_model.transform(ReplaceVerilogRelPaths()) + kernel_model = kernel_model.transform( + CreateStitchedIP( + self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True + ) + ) + kernel_model = kernel_model.transform( + CreateVitisXO(sdp_node.onnx_node.name) + ) + kernel_model.save(dataflow_model_filename) + # Assemble design from 
kernels + model = model.transform(VitisLink(self.platform, round(1000 / self.period_ns))) + + return (model, False) diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py index 2ddaf4f840f449d3f5ec5cb83eaf461d624eb7a2..9943d371dad79a977b61810bcddafdcba505d6cc 100644 --- a/src/finn/transformation/move_reshape.py +++ b/src/finn/transformation/move_reshape.py @@ -36,5 +36,15 @@ class RemoveCNVtoFCFlatten(Transformation): graph_modified = True consumer.input[0] = n.input[0] graph.node.remove(n) + elif producer.op_type == "Transpose": + transp_node = producer + producer = model.find_producer(transp_node.input[0]) + if _is_fpgadataflow_node(producer) is True: + consumer = model.find_consumer(n.output[0]) + if _is_fpgadataflow_node(consumer) is True: + graph_modified = True + consumer.input[0] = transp_node.input[0] + graph.node.remove(n) + graph.node.remove(transp_node) return (model, graph_modified) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 2b03532ce3ba7d5159e5ae57e61c2af9c8c37fce..b47f269dd6f2671c3d98c9316954483c0e72f14f 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -502,6 +502,73 @@ class MoveLinearPastEltwiseAdd(Transformation): return (model, graph_modified) +class MoveScalarLinearPastInvariants(Transformation): + """Move scalar linear operations (mul, add) past functions which are invariant + to them. Specifically, matches and transforms the following patterns: + f(x*C) -> f(x) * C + f(x+C) -> f(x) + C + where x is a dynamic input, C is a constant tensor. 
+ Known f which obey this property are: Reshape, Flatten, Transpose, + GlobalAveragePool + """ + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + nodes = [n for n in graph.node] + for n in nodes: + node_ind += 1 + if ( + n.op_type == "GlobalAveragePool" + or n.op_type == "Reshape" + or n.op_type == "Transpose" + or n.op_type == "Flatten" + ): + in0 = n.input[0] + if in0 is None: + continue + # find and check producer on our input + prod0 = model.find_producer(in0) + if prod0 is None: + continue + + if prod0.op_type == "Mul" or prod0.op_type == "Add": + # check if second input of producer is an initializer + init0 = model.get_initializer(prod0.input[1]) + # if either initializer is None, skip + if init0 is None: + continue + # if initializer is not scalar, skip + if np.prod(init0.shape) != 1: + continue + # move prod0 from input to output, + old_prod0_in = prod0.input[0] + old_prod0_out = prod0.output[0] + scalar_op_odt = model.get_tensor_datatype(old_prod0_out) + old_n_out = n.output[0] + in_shape = model.get_tensor_shape(n.input[0]) + out_shape = model.get_tensor_shape(n.output[0]) + n.input[0] = old_prod0_in + n.output[0] = old_prod0_out + prod0.input[0] = old_prod0_out + prod0.output[0] = old_n_out + model.set_tensor_shape(n.input[0], in_shape) + model.set_tensor_shape(n.output[0], out_shape) + model.set_tensor_shape(prod0.output[0], out_shape) + model.set_tensor_datatype(prod0.output[0], scalar_op_odt) + model.set_tensor_datatype(n.output[0], DataType.FLOAT32) + graph.node.remove(prod0) + graph.node.insert(node_ind - 1, prod0) + graph_modified = True + else: + continue + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class MakeMaxPoolNHWC(Transformation): """Convert (MaxPool, NHWCTranpose) into (MaxPoolNHWC).""" @@ -685,6 +752,7 @@ class MoveMaxPoolPastMultiThreshold(Transformation): model = model.transform(InferShapes()) return 
(model, graph_modified) + class MoveFlattenPastTopK(Transformation): """Move flatten node past a succeeding topk node, if the "axis" attribute in topk is set to -1 and the data layout before the flatten is NHWC with H=W=1""" @@ -745,6 +813,7 @@ class MoveFlattenPastTopK(Transformation): model = model.transform(InferShapes()) return (model, graph_modified) + class MoveFlattenPastAffine(Transformation): """Moves a node that implements a (1, -1) reshape past a MatMul, Mul or Add node.""" @@ -831,9 +900,10 @@ class MoveFlattenPastAffine(Transformation): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) - model = model.transform(InferDataLayouts()) + model = model.transform(InferDataLayouts()) return (model, graph_modified) - + + class MoveTransposePastScalarMul(Transformation): """Moves a Transpose node past a scalar Mul node""" @@ -895,4 +965,3 @@ class MoveTransposePastScalarMul(Transformation): model = model.transform(InferDataLayouts()) model = model.transform(InferShapes()) return (model, graph_modified) - diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 4a8277e08d3fc21e0b20668edf2ecad947b36647..91ff811069369383099f5ae5aebf3228fbdbaae5 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -51,6 +51,19 @@ pynq_native_port_width["Pynq-Z2"] = 64 pynq_native_port_width["Ultra96"] = 128 pynq_native_port_width["ZCU104"] = 128 +# Alveo device and platform mappings +alveo_part_map = dict() +alveo_part_map["U50"] = "xcu50-fsvh2104-2L-e" +alveo_part_map["U200"] = "xcu200-fsgd2104-2-e" +alveo_part_map["U250"] = "xcu250-figd2104-2L-e" +alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e" + +alveo_default_platform = dict() +alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_201920_3" +alveo_default_platform["U200"] = "xilinx_u200_xdma_201830_2" +alveo_default_platform["U250"] = "xilinx_u250_xdma_201830_2" +alveo_default_platform["U280"] = "xilinx_u280_xdma_201920_3" + def get_rtlsim_trace_depth(): """Return the 
trace depth for rtlsim via PyVerilator. Controllable diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py index 22c356a5869b25fcc7ae3ef0164ed61b53ef232c..188f20e22fc52e435f8ba0e7d76dff223e084d69 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py @@ -23,6 +23,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.custom_op.im2col import compute_conv_output_dim +from finn.custom_op.registry import getCustomOp # conv_config kernel_size,stride, pad @@ -110,3 +111,8 @@ def test_convert_to_hls_conv_layer(conv_config, exec_mode): assert oxe.compare_execution(model, new_model, inp_dict) if kernel_size == 1 and stride > 1 and pad == 0: assert new_model.graph.node[1].op_type == "DownSampler" + + if pad == 1: + padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + padding_inst = getCustomOp(padding_node) + assert padding_inst.get_nodeattr("SIMD") == in_chn diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py new file mode 100644 index 0000000000000000000000000000000000000000..9d861929f3d421c431a27ccac5d513938aa7d726 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py @@ -0,0 +1,232 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import numpy as np + +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fold_constants import FoldConstants +from finn.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + SortGraph, +) +from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.util.basic import gen_finn_dt_tensor +from finn.util.test import soft_verify_topk +from finn.transformation.double_to_single_float import DoubleToSingleFloat +from finn.transformation.insert_topk import InsertTopK +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.streamline.absorb import ( + AbsorbScalarMulIntoTopK, + AbsorbConsecutiveTransposes, +) +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedMul, + CollapseRepeatedAdd, +) +from finn.transformation.streamline.reorder import MoveAddPastMul + +import pytest + +export_onnx_path = "test_output_synthetic.onnx" + +# construct a synthetic graph to test: +# topk insertion, topk conversion to hls, add conversion to hls +# graph should just be a sum + + +def make_model(ch, ifmdim): + shape = [1, ch, ifmdim, ifmdim] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + inp1_add0_ct = helper.make_tensor_value_info("inp1_add0_ct", TensorProto.FLOAT, [1]) + inp1_add = helper.make_tensor_value_info("inp1_add", TensorProto.FLOAT, shape) + inp1_add_ct = 
helper.make_tensor_value_info("inp1_add_ct", TensorProto.FLOAT, [1]) + inp2_add = helper.make_tensor_value_info("inp2_add", TensorProto.FLOAT, shape) + inp2_add_ct = helper.make_tensor_value_info("inp2_add_ct", TensorProto.FLOAT, [1]) + inp1_mul = helper.make_tensor_value_info("inp1_mul", TensorProto.FLOAT, shape) + inp1_mul_ct = helper.make_tensor_value_info("inp1_mul_ct", TensorProto.FLOAT, [1]) + inp2_mul = helper.make_tensor_value_info("inp2_mul", TensorProto.FLOAT, shape) + inp2_mul_ct = helper.make_tensor_value_info("inp2_mul_ct", TensorProto.FLOAT, [1]) + eltwise_add = helper.make_tensor_value_info("eltwise_add", TensorProto.FLOAT, shape) + pool = helper.make_tensor_value_info("pool", TensorProto.FLOAT, [1, ch, 1, 1]) + reshape_ct = helper.make_tensor_value_info("reshape_ct", TensorProto.INT64, [2]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch]) + + add0_node = helper.make_node("Add", [inp.name, inp1_add0_ct.name], ["out_add0"]) + add1_node = helper.make_node("Add", ["out_add0", inp1_add_ct.name], [inp1_add.name]) + add2_node = helper.make_node("Add", ["out_add0", inp2_add_ct.name], [inp2_add.name]) + mul1_node = helper.make_node( + "Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name] + ) + mul2_node = helper.make_node( + "Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name] + ) + eltwise_add_node = helper.make_node( + "Add", [inp1_mul.name, inp2_mul.name], [eltwise_add.name] + ) + globalavgpool_node = helper.make_node( + "GlobalAveragePool", [eltwise_add.name], [pool.name] + ) + reshape_node = helper.make_node( + "Reshape", [pool.name, reshape_ct.name], [outp.name] + ) + + graph = helper.make_graph( + nodes=[ + add0_node, + add1_node, + add2_node, + mul1_node, + mul2_node, + eltwise_add_node, + globalavgpool_node, + reshape_node, + ], + name="graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="add-model") + model = ModelWrapper(model) + + # set initializers for scalar add/mul 
nodes + model.set_initializer(add0_node.input[1], np.array([0.0])) + model.set_initializer(add1_node.input[1], np.array([7.0])) + model.set_initializer(add2_node.input[1], np.array([8.0])) + model.set_initializer(mul1_node.input[1], np.array([2.0])) + model.set_initializer(mul2_node.input[1], np.array([2.0])) + model.set_initializer(reshape_node.input[1], np.array([1, -1])) + + return model + + +# data types +@pytest.mark.parametrize("idt", [DataType.UINT2]) +# channels +@pytest.mark.parametrize("ch", [16]) +# ifmdim +@pytest.mark.parametrize("ifmdim", [5]) +@pytest.mark.vivado +@pytest.mark.slow +def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): + model = make_model(ch, ifmdim) + model.save(export_onnx_path) + model = ModelWrapper(export_onnx_path) + model = model.transform(DoubleToSingleFloat()) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataLayouts()) + # model.save("golden.onnx") + # generate test vectors of correct shape + if ifmdim == -1: + input_tensor_shape = (1, ch) + else: + input_tensor_shape = (1, ch, ifmdim, ifmdim) + + x = gen_finn_dt_tensor(idt, input_tensor_shape) + + # generate expected value from streamlined net + input_dict = {model.graph.input[0].name: x} + + output_dict = oxe.execute_onnx(model, input_dict, True) + produced_sum = output_dict[model.graph.output[0].name] + chw_mul = model.get_initializer(model.graph.node[-1].input[1]) + chw_mul = 1 + expected_sum = chw_mul * np.sum(2 * (2 * x + 15.0), axis=(2, 3)) / (ifmdim * ifmdim) + assert (produced_sum.flatten() == expected_sum.flatten()).all() + + model = model.transform(InferDataLayouts()) + + # convert to hls + model.set_tensor_datatype(model.graph.input[0].name, idt) + # extra streamlining + model = model.transform(MoveScalarLinearPastInvariants()) + model = model.transform(MoveAddPastMul()) + model = 
model.transform(CollapseRepeatedMul()) + model = model.transform(CollapseRepeatedAdd()) + # insert top-k node, which should absorb linear ops before it + + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + model = model.transform(InferDataTypes()) + + model = model.transform(to_hls.InferChannelwiseLinearLayer()) + model = model.transform(to_hls.InferAddStreamsLayer()) + model = model.transform(to_hls.InferGlobalAccPoolLayer()) + model = model.transform(MoveScalarLinearPastInvariants()) + model = model.transform(InsertTopK()) + model = model.transform(AbsorbScalarMulIntoTopK()) + model = model.transform(InferDataTypes()) + model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(AbsorbConsecutiveTransposes()) + model = model.transform(InferDataTypes()) + model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hls.InferDuplicateStreamsLayer()) + + model = model.transform(SortGraph()) + + # model.save("golden_hls.onnx") + # check topology status + + finn_nodes = model.get_finn_nodes() + assert len(finn_nodes) == 9 + add_nodes = model.get_nodes_by_op_type("AddStreams_Batch") + assert len(add_nodes) == 1 + pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_Batch") + assert len(pool_nodes) == 1 + label_nodes = model.get_nodes_by_op_type("LabelSelect_Batch") + assert len(label_nodes) == 1 + channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_Batch") + assert len(channelwise_nodes) == 5 + dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_Batch") + assert len(dup_nodes) == 1 + + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + + output_dict = oxe.execute_onnx(model, input_dict, True) + produced_topk_hls = output_dict[model.graph.output[0].name] + topk_input = output_dict[model.graph.node[-1].input[0]] + assert soft_verify_topk(topk_input, produced_topk_hls, 5) + + 
os.remove(export_onnx_path) diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py index c9f78dcea1a1ce364d0657ad64de7d440d41b822..aba973051cb14e3e428e4de72a57924884c831de 100644 --- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py @@ -77,27 +77,63 @@ def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, i return model +def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt): + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim] + ) + + mp_node = helper.make_node( + "QuantAvgPool2d", + ["inp"], + ["outp"], + domain="finn", + stride=stride, + kernel=k, + ibits=idt.bitwidth(), + obits=odt.bitwidth(), + signed=1 if idt.signed() else 0, + data_layout="NCHW", + ) + graph = helper.make_graph( + nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="mp-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + model = model.transform(InferShapes()) + + return model + + def prepare_inputs(input_tensor): return {"inp": input_tensor} # input datatype -@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4]) +@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4, DataType.INT8]) +# output datatype +@pytest.mark.parametrize("odt", [DataType.UINT4, DataType.INT4]) # pool configuration: ( k,stride, pad, ifm_dim ) -@pytest.mark.parametrize( - "pool_config", [(3, 2, 0, 5), (3, 2, 1, 5), (2, 2, 0, 8), (5, 2, 2, 7)] -) +@pytest.mark.parametrize("pool_config", [(7, 7, 0, 7), (3, 2, 1, 5)]) # input channels -@pytest.mark.parametrize("ifm_ch", [1, 4, 20]) +@pytest.mark.parametrize("ifm_ch", [1, 4]) # number of out 
channel computed in parallel -@pytest.mark.parametrize("pe", [1, 4, 20]) +@pytest.mark.parametrize("pe", [1, 2, 4]) +# pool type +@pytest.mark.parametrize("op_type", ["QuantAvgPool2d", "MaxPool"]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) -# pool type -@pytest.mark.parametrize("op_type", ["MaxPool"]) @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_type): +def test_convert_to_hls_pool_batch( + idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode +): k, stride, pad, ifm_dim = pool_config if ifm_ch % pe != 0: @@ -113,9 +149,25 @@ def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_t # prepare input data input_dict = prepare_inputs(x) if op_type == "MaxPool": + # if idt.signed(): + # pytest.skip("""No support for signed input (see accu initialization + # in Pool_batch HLSLIB function). Skipping""") + + if idt != odt: + pytest.skip("Skipping Maxpool with idt != odt") + model = make_single_maxpool_modelwrapper( k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt ) + elif op_type == "QuantAvgPool2d": + if pad != 0: + pytest.skip("No padding support for QuantAvgPool2d. 
Skipping") + + if idt.signed() != odt.signed(): + pytest.skip("Skipping QuantAvgPool2d with idt.signed() != odt.signed()") + model = make_single_quantavpool_modelwrapper( + k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt + ) else: assert False, "{} is not a supported op_type".format(op_type) @@ -151,7 +203,7 @@ def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_t # execute new_model y_produced = oxe.execute_onnx(new_model, input_dict)["outp"] assert (y_produced == y_expected).all() - if stride != k: + if stride <= k: if pad == 0 or ifm_ch == pe: assert len(new_model.graph.node) == 4 else: diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index 4fb84be59333ef0e696204c9064fcf77e35b5d9b..59ac1c09f4fe338ef03a8166c63b9d4b29bbc08e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -33,6 +33,8 @@ from onnx import TensorProto, helper import finn.core.onnx_exec as oxe from finn.core.datatype import DataType from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -72,6 +74,9 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt): model.set_tensor_datatype("inp", idt) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return model diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py index fc5cdb7745945bee99564ba9ab19423a66d8e035..251fc806c3b0f8a52183b8003db6d930351b0ace 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py +++ 
b/tests/fpgadataflow/test_fpgadataflow_fclayer.py @@ -134,7 +134,7 @@ def prepare_inputs(input_tensor, idt, wdt): # mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"]) # activation: None or DataType @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4]) # weight datatype @@ -221,7 +221,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"]) # activation: None or DataType @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4]) # weight datatype @@ -329,7 +329,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) @pytest.mark.vivado -def test_fpgadataflow_fclayer_large_depth_decoupled_mode( +def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( mem_mode, idt, wdt, act, nf, sf, mw, mh ): if nf == -1: diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 9fcd78521e967ebed248e1873f92700673d484f2..c86ef8bf3e010f9ba21306a0308c8e992930a9b3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -50,13 +50,19 @@ from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor, pynq_part_map +from finn.util.basic import ( + gen_finn_dt_tensor, + pynq_part_map, + alveo_part_map, + alveo_default_platform, +) from finn.util.fpgadataflow 
import pyverilate_stitched_ip from finn.util.test import load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext from finn.transformation.infer_data_layouts import InferDataLayouts from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.floorplan import Floorplan +from finn.transformation.fpgadataflow.vitis_build import VitisBuild from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild @@ -120,7 +126,7 @@ def create_one_fc_model(): return model -def create_two_fc_model(): +def create_two_fc_model(mem_mode="decoupled"): # create a model with two StreamingFCLayer instances wdt = DataType.INT2 idt = DataType.INT32 @@ -153,7 +159,7 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, - mem_mode="decoupled", + mem_mode=mem_mode, ) fc1 = helper.make_node( @@ -173,7 +179,7 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, - mem_mode="decoupled", + mem_mode=mem_mode, ) graph = helper.make_graph( @@ -248,35 +254,35 @@ def test_fpgadataflow_ipstitch_rtlsim(): model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd") sim = pyverilate_stitched_ip(model) exp_io = [ - "ap_clk_0", - "ap_rst_n_0", - "in0_V_V_0_tdata", - "in0_V_V_0_tready", - "in0_V_V_0_tvalid", - "out_r_0_tdata", - "out_r_0_tkeep", - "out_r_0_tlast", - "out_r_0_tready", - "out_r_0_tvalid", - "s_axi_control_0_araddr", - "s_axi_control_0_arready", - "s_axi_control_0_arvalid", - "s_axi_control_0_awaddr", - "s_axi_control_0_awready", - "s_axi_control_0_awvalid", - "s_axi_control_0_bready", - "s_axi_control_0_bresp", - "s_axi_control_0_bvalid", - "s_axi_control_0_rdata", - "s_axi_control_0_rready", - "s_axi_control_0_rresp", - "s_axi_control_0_rvalid", - "s_axi_control_0_wdata", - "s_axi_control_0_wready", - "s_axi_control_0_wstrb", - "s_axi_control_0_wvalid", + "ap_clk", + "ap_rst_n", + "s_axis_0_tdata", + 
"s_axis_0_tready", + "s_axis_0_tvalid", + "m_axis_0_tdata", + "m_axis_0_tkeep", + "m_axis_0_tlast", + "m_axis_0_tready", + "m_axis_0_tvalid", + "s_axi_control_araddr", + "s_axi_control_arready", + "s_axi_control_arvalid", + "s_axi_control_awaddr", + "s_axi_control_awready", + "s_axi_control_awvalid", + "s_axi_control_bready", + "s_axi_control_bresp", + "s_axi_control_bvalid", + "s_axi_control_rdata", + "s_axi_control_rready", + "s_axi_control_rresp", + "s_axi_control_rvalid", + "s_axi_control_wdata", + "s_axi_control_wready", + "s_axi_control_wstrb", + "s_axi_control_wvalid", ] - assert dir(sim.io) == exp_io + assert sorted(dir(sim.io)) == sorted(exp_io) model.set_metadata_prop("exec_mode", "rtlsim") idt = model.get_tensor_datatype("inp") ishape = model.get_tensor_shape("inp") @@ -413,6 +419,28 @@ def test_fpgadataflow_ipstitch_iodma_floorplan(): model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_iodma_floorplan.onnx") +# board +@pytest.mark.parametrize("board", ["U250"]) +# clock period +@pytest.mark.parametrize("period_ns", [5]) +# override mem_mode to external +@pytest.mark.parametrize("extw", [True, False]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.vitis +def test_fpgadataflow_ipstitch_vitis(board, period_ns, extw): + platform = alveo_default_platform[board] + fpga_part = alveo_part_map[board] + model = create_two_fc_model("external" if extw else "decoupled") + if model.graph.node[0].op_type == "StreamingDataflowPartition": + sdp_node = getCustomOp(model.graph.node[0]) + assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" + assert os.path.isfile(sdp_node.get_nodeattr("model")) + model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) + model = model.transform(VitisBuild(fpga_part, period_ns, platform)) + model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_vitis.onnx") + + # board @pytest.mark.parametrize("board", ["Pynq-Z1"]) @pytest.mark.slow diff --git a/tests/transformation/test_topk_insert.py 
b/tests/transformation/test_topk_insert.py index a18e63384150f140cb63ec7b438283eb4797266c..b85ed4aa6999faf751e535c1cc687d639c4eb74f 100644 --- a/tests/transformation/test_topk_insert.py +++ b/tests/transformation/test_topk_insert.py @@ -1,4 +1,4 @@ -import os +# import os import onnx from finn.util.test import get_test_model_trained import brevitas.onnx as bo @@ -57,4 +57,4 @@ def test_topk_insert(k): output_pysim_topk = output_pysim_topk.astype(np.int).flatten() assert np.array_equal(output_golden_topk, output_pysim_topk) - os.remove(export_onnx_path) + # os.remove(export_onnx_path)