Unverified commit cdc5ec4b authored by auphelia, committed by GitHub

Merge pull request #760 from Xilinx/dev

Release Merge for v0.9
parents 41740ed1 17af0c35
Showing with 132 additions and 103 deletions
name: DockerImage
on:
pull_request:
branches: [ dev ]
push:
branches:
- 'dev'
branches: [ dev ]
jobs:
docker:
runs-on: ubuntu-18.04
runs-on: ubuntu-20.04
steps:
-
name: checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
......
@@ -13,10 +13,12 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: '3.8'
- name: Run Lint
uses: pre-commit/action@v2.0.0
uses: pre-commit/action@v3.0.0
@@ -11,43 +11,15 @@ jobs:
test:
name: Run quicktest on PR branch
runs-on: ubuntu-18.04
runs-on: ubuntu-20.04
steps:
- name: checkout
uses: actions/checkout@v2
- name: set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: cache Docker layers
uses: actions/cache@v2
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ github.sha }}
restore-keys: |
${{ runner.os }}-buildx-
- name: Build and push
uses: docker/build-push-action@v2
with:
file: docker/Dockerfile.finn
context: .
push: false
load: true
tags: finn_gha
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache-new
-
# Temp fix
# https://github.com/docker/build-push-action/issues/252
# https://github.com/moby/buildkit/issues/1896
name: Move cache
run: |
rm -rf /tmp/.buildx-cache
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
uses: actions/checkout@v3
- name: DockerRunQuicktest
run: |
docker run --init --hostname finn_gha -w $(pwd) -v $(pwd):$(pwd) -e FINN_BUILD_DIR=/tmp/finn_gha -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh
export FINN_ROOT=$(pwd)
export FINN_BUILD_DIR=/tmp/finn_gha
export FINN_INST_NAME=finn_gha
./run-docker.sh quicktest
@@ -51,7 +51,7 @@ repos:
args: ['--fix=no']
- repo: https://github.com/PyCQA/isort
rev: 5.10.1
rev: 5.12.0
hooks:
- id: isort
@@ -61,7 +61,7 @@ repos:
- id: black
language_version: python3
- repo: https://gitlab.com/pycqa/flake8
- repo: https://github.com/PyCQA/flake8
rev: 3.9.2
hooks:
- id: flake8
......
@@ -35,7 +35,7 @@ sphinx:
configuration: docs/finn/conf.py
python:
version: 3.7
version: 3.8
install:
- method: pip
path: .
......
@@ -9,7 +9,7 @@ Contributors
* Hendrik Borras (@HenniOVP)
* Lucian Petrica (@quetric)
* Tobias Alonso (@Tobi-Alonso)
* Felix Paul Jentzsch (@felixpj)
* Felix Paul Jentzsch (@fpjentzsch)
* Mirza Mrahorovic (@mmrahorovic)
* Suranga Mahesh (@surangamh)
* Peter Lehnhardt (@pete-lennart)
@@ -26,3 +26,5 @@ Contributors
* Aziz Bahri (@azizb-xlnx)
* Fionn O'Donohoe (@fionnodonohoe-xlnx)
* Matthias Gehre (@mgehre-amd)
* Hugo Le Blevec (@hleblevec)
* Patrick Geel (@patrickgeel)
@@ -28,7 +28,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s
## Documentation
You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/master/notebooks), which we recommend running from inside Docker for a better experience.
You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience.
## Community
@@ -67,4 +67,4 @@ The current implementation of the framework is based on the following publicatio
## Old version
We previously released an early-stage prototype of a toolflow that took in Caffe-HWGQ binarized network descriptions and produced dataflow architectures. You can find it in the [v0.1](https://github.com/Xilinx/finn/tree/v0.1) branch in this repository.
Please be aware that this version is deprecated and unsupported, and the master branch does not share history with that branch so it should be treated as a separate repository for all purposes.
Please be aware that this version is deprecated and unsupported, and the main branch does not share history with that branch so it should be treated as a separate repository for all purposes.
@@ -26,14 +26,15 @@
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#ifndef LOOKUP_HPP
#define LOOKUP_HPP
#include <ap_int.h>
#include <hls_stream.h>
#ifndef LOOKUP_HPP
#define LOOKUP_HPP
#include "utils.hpp"
template <
unsigned NumEmbeddings,
@@ -57,4 +58,50 @@ void StreamingLookup(
}
}
/**
* Lookup implementation over a table stored in AXI-accessible memory.
*/
template <
unsigned EmbeddingSize, // Number of memory words per embedding
unsigned EmbeddingAlign = clog2(EmbeddingSize), // Alignment of entries = number of word index bits
typename T_SRC,
typename T_DST
>
void StreamingLookup_ext(
hls::stream<T_SRC> &in0,
hls::stream<T_DST> &out,
T_DST const *const mem,
unsigned const size,
unsigned &oob_count,
bool &oob_irq
) {
#pragma HLS pipeline II=EmbeddingSize+9 style=flp
static unsigned oob_count_li;
static unsigned oob_count_int;
#pragma HLS reset variable=oob_count_li
#pragma HLS reset variable=oob_count_int
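// Reconcile with host writes: if the host has rewritten oob_count
// (e.g. zeroing it to acknowledge the events seen so far), subtract the
// last reported value so only unacknowledged events remain counted.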
if(oob_count != oob_count_li) {
oob_count_int -= oob_count_li;
oob_count_li = oob_count;
}
if(!in0.empty()) {
T_SRC const x = in0.read();
// Map out-of-bounds inputs to an offset of zero and increment counter
bool const oob = x >= T_SRC(size);
ap_uint<T_SRC::width+EmbeddingAlign> const ofs =
((oob? T_SRC(0) : x), ap_uint<EmbeddingAlign>(0));
oob_count_int += oob;
// Stream lookup data (burst inferred)
for(unsigned i = 0; i < EmbeddingSize; i++) {
#pragma HLS pipeline II=1 style=flp
out.write(mem[ofs+i]);
}
}
oob_count = oob_count_int;
oob_irq = (oob_count_int != 0);
}
#endif
@@ -46,7 +46,6 @@ RUN apt-get update && \
libsm6 \
libxext6 \
libxrender-dev \
verilator \
nano \
zsh \
rsync \
@@ -62,6 +61,16 @@ RUN apt-get update && \
RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
RUN locale-gen "en_US.UTF-8"
# install Verilator from source to get the right version
RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev
RUN git clone https://github.com/verilator/verilator
RUN cd verilator && \
git checkout v4.224 && \
autoconf && \
./configure && \
make -j4 && \
make install
# install XRT
RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb
RUN apt install -y /tmp/$XRT_DEB_VERSION.deb
@@ -75,7 +84,7 @@ RUN rm requirements.txt
# extra Python package dependencies (for testing and interaction)
RUN pip install pygments==2.4.1
RUN pip install ipykernel==5.5.5
RUN pip install jupyter==1.0.0
RUN pip install jupyter==1.0.0 --ignore-installed
RUN pip install markupsafe==2.0.1
RUN pip install matplotlib==3.3.1 --ignore-installed
RUN pip install pytest-dependency==0.5.1
......
@@ -2,7 +2,7 @@
: ${PYTEST_PARALLEL=auto}
cd $FINN_ROOT/finn
cd $FINN_ROOT
# check if command line argument is empty or not present
if [ -z $1 ]; then
echo "Running quicktest: not (vivado or slow or board) with pytest-xdist"
......
@@ -16,6 +16,6 @@ Two of the Brevitas-exported ONNX variants can be ingested by FINN:
To work with either type of ONNX model, it is loaded into a :ref:`modelwrapper` provided by FINN.
At this stage we can already use the functional verification flow to simulate the model using Python, this is marked in the graphic with the dotted arrow. For more details please have look at :ref:`verification`.
At this stage we can already use the functional verification flow to simulate the model using Python. For more details please have a look at :ref:`verification`.
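As a small illustration of this stage (a sketch; the ``qonnx`` import path is an assumption based on recent FINN versions), an exported model is wrapped and can then be inspected or simulated from Python:

::

    from qonnx.core.modelwrapper import ModelWrapper  # assumed import path

    # Wrap the exported ONNX file so FINN passes can operate on it
    model = ModelWrapper("exported_model.onnx")  # hypothetical file name
    print(model.graph.node[0].op_type)           # inspect the first node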
The model can now be further processed in FINN; the next flow step is :ref:`nw_prep`.
@@ -105,7 +105,7 @@ The following outputs will be generated regardless of which particular outputs a
The other output products are controlled by the `generate_outputs` field in the
build configuration, and are detailed below.
* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.ESTIMATE_REPORTS` produces a variety of reports to estimate resource usage and performance *without* running any synthesis. This can be useful for setting up the parallelization and other hardware configuration:
* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.ESTIMATE_REPORTS` produces a variety of reports to estimate resource usage and performance *without* running any synthesis. This can be useful for setting up the parallelization and other hardware configuration:
* ``report/estimate_layer_cycles.json`` -- cycles per layer estimation from analytical model
* ``report/estimate_layer_resources.json`` -- resources per layer estimation from analytical model
@@ -113,31 +113,31 @@ build configuration), and are detailed below.
* ``report/estimate_network_performance.json`` -- whole-network performance estimation from analytical model
* ``report/op_and_param_counts.json`` -- per-layer and total number of operations and parameters (independent of parallelization)
* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.STITCHED_IP`: produces a stitched Vivado IP block design that can be integrated with other FPGA designs in Vivado IPI:
* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.STITCHED_IP`: produces a stitched Vivado IP block design that can be integrated with other FPGA designs in Vivado IPI:
* ``stitched_ip/finn_vivado_stitch_proj.xpr`` -- Vivado project (including Vivado IP Integrator block design) to generate the stitched IP
* ``stitched_ip/ip`` -- exported Vivado IP for the stitched design
* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.RTLSIM_PERFORMANCE`: measure latency and performance for the stitched IP in RTL simulation, using PyVerilator
* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.RTLSIM_PERFORMANCE`: measure latency and performance for the stitched IP in RTL simulation, using PyVerilator
* ``report/rtlsim_performance.json`` -- accelerator throughput and latency from RTL simulation
* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.OOC_SYNTH` runs out-of-context synthesis for the stitched IP. This is useful for getting post-synthesis resource counts and achievable clock frequency without having to produce a full bitfile with DMA engines:
* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.OOC_SYNTH` runs out-of-context synthesis for the stitched IP. This is useful for getting post-synthesis resource counts and achievable clock frequency without having to produce a full bitfile with DMA engines:
* ``report/ooc_synth_and_timing.json`` -- resources and achievable clock frequency from out-of-context synthesis
* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.BITFILE` will run Vivado and/or Vitis to insert the FINN accelerator inside a shell, with DMA engines instantiated to move data to/from main memory:
* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.BITFILE` will run Vivado and/or Vitis to insert the FINN accelerator inside a shell, with DMA engines instantiated to move data to/from main memory:
* ``bitfile/finn-accel.(bit|xclbin)`` -- generated bitfile depending on platform
* ``report/post_synth_resources.xml`` -- FPGA resource utilization after synthesis
* ``report/post_route_timing.rpt`` -- post-route timing report
* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.PYNQ_DRIVER` will generate a PYNQ Python driver that can be used to interface the generated accelerator:
* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.PYNQ_DRIVER` will generate a PYNQ Python driver that can be used to interface the generated accelerator:
* ``driver/driver.py`` -- Python driver that can be used on PYNQ on Zynq or Alveo platforms to launch the accelerator
* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.DEPLOYMENT_PACKAGE`:
* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.DEPLOYMENT_PACKAGE`:
* ``deploy/`` -- deployment package folder with a bitfile and driver, ready to be copied to target hardware platform
@@ -153,7 +153,7 @@ and compare it against the expected output that you provide.
This is achieved by setting up the following members of the build configuration:
* Set ``verify_steps`` to be a list of :py:mod:`finn.builder.build_dataflow.VerificationStepType`
* Set ``verify_steps`` to be a list of :py:mod:`finn.builder.build_dataflow_config.VerificationStepType`
where each element in the list indicates the output of a particular step
that will be verified. See the documentation of the ``VerificationStepType``
for more information.
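A minimal sketch of such a build configuration (the field values are illustrative assumptions, not the only valid choices):

::

    from finn.builder.build_dataflow import build_dataflow_cfg
    from finn.builder.build_dataflow_config import (
        DataflowBuildConfig,
        DataflowOutputType,
        VerificationStepType,
    )

    cfg = DataflowBuildConfig(
        output_dir="build_example",        # hypothetical output folder
        synth_clk_period_ns=10.0,          # 100 MHz target clock
        board="Pynq-Z1",
        generate_outputs=[DataflowOutputType.ESTIMATE_REPORTS],
        # verify the Python-level output after the tidy-up step
        verify_steps=[VerificationStepType.TIDY_UP_PYTHON],
    )
    build_dataflow_cfg("model.onnx", cfg)  # hypothetical model file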
......
@@ -12,7 +12,7 @@ Prerequisites
Before starting development on FINN, it's a good idea to first understand
the basics as a user. Going through all of the
:ref:`tutorials` is strongly recommended if you haven' already done so.
:ref:`tutorials` is strongly recommended if you haven't already done so.
Additionally, please review the documentation available on :ref:`internals`.
Repository structure
@@ -153,7 +153,7 @@ from the FINN root directory as follows:
::
python setup.py test --addopts "-k test_brevitas_debug --pdb"
pytest -k test_brevitas_debug --pdb
If you want to run tests in parallel (e.g. to take advantage of a multi-core CPU)
......
@@ -9,7 +9,7 @@ As you can see in the picture, FINN has a high modularity and has the property t
:scale: 50%
:align: center
The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into five sections, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS and Vivado IPI (orange section). There is also a section for testing and verification in software (red section) and the hardware generation and deployment on the PYNQ board (yellow section).
The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into five sections, each of which includes several flow steps. The flow starts in the top left corner with Brevitas export, followed by the preparation of the network for Vitis HLS and Vivado IPI. There is also a section for testing and verification in software (in the cloud on the right) and one for hardware generation and deployment on the PYNQ board.
This example flow is covered in the `end2end_example <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example>`_ Jupyter notebooks.
For a more detailed overview about the different flow sections, please have a look at the corresponding pages:
......
@@ -20,7 +20,7 @@ How do I use FINN?
==================
We strongly recommend that you first watch one of the pre-recorded `FINN tutorial <https://www.youtube.com/watch?v=zw2aG4PhzmA&amp%3Bindex=2>`_
videos, then follow the Jupyter notebook tutorials for `training and deploying an MLP for network intrusion detection <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/cybersecurity>`_ .
videos, then follow the Jupyter notebook tutorials for `training and deploying an MLP for network intrusion detection <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example/cybersecurity>`_ .
You may also want to check out the other :ref:`tutorials`, and the `FINN examples repository <https://github.com/Xilinx/finn-examples>`_ .
Our aim in FINN is *not* to accelerate common off-the-shelf neural networks, but instead provide you with a set of tools
@@ -28,19 +28,19 @@ to train *customized* networks and create highly-efficient FPGA implementations
In general, the approach for using the FINN framework is as follows:
1. Train your own quantized neural network (QNN) in `Brevitas <https://github.com/Xilinx/brevitas>`_. We have some `guidelines <https://bit.ly/finn-hls4ml-qat-guidelines>`_ on quantization-aware training (QAT).
2. Export to FINN-ONNX by following `this tutorial <https://github.com/Xilinx/finn/blob/master/notebooks/basics/1_brevitas_network_import.ipynb>`_ .
3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial <https://github.com/Xilinx/finn/blob/master/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb>`_
2. Export to FINN-ONNX by following `this tutorial <https://github.com/Xilinx/finn/blob/main/notebooks/basics/1_brevitas_network_import.ipynb>`_ .
3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial <https://github.com/Xilinx/finn/blob/main/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb>`_
4. Adjust your QNN topology, quantization settings and ``build_dataflow`` configuration to get the desired results.
Please note that the framework is still under development, and how well this works will depend on how similar your custom network is to the examples we provide.
If there are substantial differences, you will most likely have to write your own
Python scripts that call the appropriate FINN compiler
functions that process your design correctly, or add new functions (including
Vivado HLS layers)
Vitis HLS layers)
as required.
The `advanced FINN tutorials <https://github.com/Xilinx/finn/tree/master/notebooks/advanced>`_ can be useful here.
The `advanced FINN tutorials <https://github.com/Xilinx/finn/tree/main/notebooks/advanced>`_ can be useful here.
For custom networks, we recommend making a copy of the `BNN-PYNQ end-to-end
Jupyter notebook tutorials <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/bnn-pynq>`_ as a starting point, visualizing the model at intermediate
Jupyter notebook tutorials <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example/bnn-pynq>`_ as a starting point, visualizing the model at intermediate
steps and adding calls to new transformations as needed.
Once you have a working flow, you can implement a command line entry for this
by using the "advanced mode" described in the :ref:`command_line` section.
@@ -50,7 +50,8 @@ Running FINN in Docker
FINN runs inside a Docker container and comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources <https://docker-curriculum.com/>`_ to get started.
You may want to review the :ref:`General FINN Docker tips` and :ref:`Environment variables` as well.
If you want to use prebuilt images, read :ref:`Using a prebuilt image`.
The ``run-docker.sh`` script that can be launched in the following modes:
The above-mentioned script to build and launch the FINN Docker container is called `run-docker.sh <https://github.com/Xilinx/finn/blob/main/run-docker.sh>`_ . It can be launched in the following modes:
Launch interactive shell
************************
@@ -140,10 +141,7 @@ If you are having trouble building the Docker image or need offline access, you
Supported FPGA Hardware
=======================
**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ <http://www.pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards.
.. warning::
In previous FINN versions (v0.4b - v0.7) we had support for `Xilinx Alveo boards <https://www.xilinx.com/products/boards-and-kits/alveo.html>`_ using PYNQ and Vitis 2020.1, see instructions below for Alveo setup that works with older versions. Please note that with the new release with Vitis 2022.1, we do only have experimental support to automatically deployment for Alveo cards.
**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ <http://www.pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards.
**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator.
@@ -181,12 +179,12 @@ On the target side:
On the host side:
1. Install Vitis 2020.1 and set up the ``VITIS_PATH`` environment variable to point to your installation.
1. Install Vitis 2022.1 and set up the ``VITIS_PATH`` environment variable to point to your installation.
2. Install Xilinx XRT. Ensure that the ``XRT_DEB_VERSION`` environment variable reflects which version of XRT you have installed.
3. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation. *This must be the same path as the target's platform files (target step 2)*
4. Set up the ``ALVEO_*`` environment variables accordingly for your target, see description of environment variables above.
5. `Set up public key authentication <https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server>`_. Copy your private key to the ``finn/ssh_keys`` folder on the host to get password-less deployment and remote execution.
6. Done! You can try the ``test_end2end_vitis`` tests in the FINN Docker to verify your setup, although this will take some time.
6. Done!
Vivado/Vitis license
*********************
@@ -214,7 +212,7 @@ We also recommend running the FINN compiler on a system with sufficiently
strong hardware:
* **RAM.** Depending on your target FPGA platform, your system must have sufficient RAM to be
able to run Vivado/Vitis synthesis for that part. See `this page <https://www.xilinx.com/products/design-tools/vivado/memory.html>`_
able to run Vivado/Vitis synthesis for that part. See `this page <https://www.xilinx.com/products/design-tools/vivado/vivado-ml.html#memory>`_
for more information. For targeting Zynq and Zynq UltraScale+ parts, at least 8 GB is recommended. Larger parts may require up to 16 GB.
For targeting Alveo parts with Vitis, at least 64 GB RAM is recommended.
......
@@ -9,14 +9,14 @@ Hardware Build and Deployment
:align: center
A model where all layers have been converted to HLS layers can be processed by
FINN to build a bitfile and driver targeting a Zynq system or to generate a Vivado IP Integrator (IPI)
FINN to build a bitfile and driver targeting a Zynq or Alveo system or to generate a Vivado IP Integrator (IPI)
design with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system.
Hardware Build
==============
Internally, the hardware build for Zynq devices consists of the following steps:
Internally, the hardware build consists of the following steps:
1. Driver generation
2. DMA and DWC node insertion
@@ -89,9 +89,4 @@ Deployment
Deployment and Remote Execution
-------------------------------
The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there using the *onnx_exec* function with the right *exec_mode* settings. For details please have a look at transformation :py:mod:`finn.transformation.fpgadataflow.make_deployment.DeployToPYNQ` and the execution function :py:mod:`finn.core.onnx_exec`.
Throughput Test
---------------
FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done by using :py:mod:`finn.core.throughput_test`. When running this function the metrics of the network are returned as dictionary.
The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there. For more information see the description in the `end2end_example <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example>`_ Jupyter notebooks.
@@ -7,7 +7,7 @@ Internals
Intermediate Representation: QONNX and FINN-ONNX
================================================
FINN uses `ONNX <https://github.com/onnx/onnx>`_ as an intermediate representation (IR) for neural networks. As such, almost every component inside FINN uses ONNX and its `Python API <https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md>`_, so you may want to familiarize yourself with how ONNX represents DNNs. Specifically, the `ONNX protobuf description <https://github.com/onnx/onnx/blob/master/onnx/onnx.proto>`_ (or its `human-readable documentation <https://github.com/onnx/onnx/blob/master/docs/IR.md>`_ and the `operator schemas <https://github.com/onnx/onnx/blob/master/docs/Operators.md>`_ are useful as reference documents. We also provide a Jupyter notebook that can help to get familiar with ONNX by showing how to work with a simple ONNX model in FINN, see chapter :ref:`tutorials` for details.
FINN uses `ONNX <https://github.com/onnx/onnx>`_ as an intermediate representation (IR) for neural networks. As such, almost every component inside FINN uses ONNX and its `Python API <https://github.com/onnx/onnx/blob/main/docs/PythonAPIOverview.md>`_, so you may want to familiarize yourself with how ONNX represents DNNs. Specifically, the `ONNX protobuf description <https://github.com/onnx/onnx/blob/main/onnx/onnx.proto>`_ (or its `human-readable documentation <https://github.com/onnx/onnx/blob/main/docs/IR.md>`_) and the `operator schemas <https://github.com/onnx/onnx/blob/main/docs/Operators.md>`_ are useful as reference documents. We also provide a Jupyter notebook that can help to get familiar with ONNX by showing how to work with a simple ONNX model in FINN; see chapter :ref:`tutorials` for details.
.. note:: FINN supports two specialized variants of ONNX called QONNX and FINN-ONNX, and not all ONNX graphs are supported by FINN (and vice versa).
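For instance, a model's nodes can be inspected directly with the standard ONNX Python API (a minimal sketch, not a FINN-specific call):

::

    import onnx

    model = onnx.load("model.onnx")  # hypothetical file name
    # Every node records its operator type plus input/output tensor names
    for node in model.graph.node:
        print(node.op_type, list(node.input), list(node.output))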
@@ -137,14 +137,14 @@ ModelWrapper contains more useful functions, if you are interested please have a
Analysis Pass
=============
An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis`.
An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis` .
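A minimal sketch of an analysis pass (the counting logic here is purely illustrative):

::

    def count_unique_ops(model):
        # Takes a ModelWrapper, returns a dict of extracted properties
        ops = {}
        for node in model.graph.node:
            ops[node.op_type] = ops.get(node.op_type, 0) + 1
        return {"unique_op_counts": ops}

    # model is a ModelWrapper; analysis passes are invoked through it
    result = model.analysis(count_unique_ops)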
.. _transformation_pass:
Transformation Pass
===================
A transformation passes changes (transforms) the given model, it gets the model in the ModelWrapper as input and returns the changed model (ModelWrapper) to the FINN flow. Additional the flag *model_was_changed* which indicates if a transformation has to be performed more than once, is returned. If you are interested in how to write a transformation pass for FINN, please take a look at the Jupyter notebook about how to write a transformation pass, see chapter :ref:`tutorials` for details. For more information about existing transformation passes in FINN, see module :py:mod:`finn.transformation`.
A transformation pass changes (transforms) the given model: it gets the model in the ModelWrapper as input and returns the changed model (ModelWrapper) to the FINN flow. Additionally, the flag *model_was_changed*, which indicates whether a transformation has to be performed more than once, is returned. If you are interested in how to write a transformation pass for FINN, please take a look at the Jupyter notebook about how to write a transformation pass; see chapter :ref:`tutorials` for details. For more information about existing transformation passes in FINN, see module :py:mod:`finn.transformation` .
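A minimal sketch of a transformation pass (the ``qonnx`` base-class import path is an assumption based on recent FINN versions; the pass itself is a toy example):

::

    from qonnx.transformation.base import Transformation  # assumed import path

    class GiveFirstNodeAName(Transformation):
        def apply(self, model):
            model.graph.node[0].name = "first_node"  # toy modification
            # The flag signals whether the pass must be applied again
            return (model, False)

    # model is a ModelWrapper; passes are invoked through transform
    model = model.transform(GiveFirstNodeAName())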
.. _mem_mode:
@@ -167,7 +167,7 @@ The following picture shows the idea behind the "const" and "decoupled" mode.
Const mode
----------
In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function <https://github.com/Xilinx/finn-hlslib/blob/19fa1197c09bca24a0f77a7fa04b8d7cb5cc1c1d/mvau.hpp#L93>`_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these.
In *const* mode the weights are "baked into" the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as a *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function <https://github.com/Xilinx/finn-hlslib/blob/master/mvau.hpp#L92>`_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of Verilog components are connected to these.
Advantages:
@@ -185,7 +185,7 @@ Disadvantages:
Decoupled mode
--------------
In *decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU <https://github.com/Xilinx/finn-hlslib/blob/07a8353f6cdfd8bcdd81e309a5581044c2a93d3b/mvau.hpp#L213>`_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib <https://github.com/Xilinx/finn/tree/dev/finn-rtllib>`_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode.
In *decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU <https://github.com/Xilinx/finn-hlslib/blob/master/mvau.hpp#L214>`_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib <https://github.com/Xilinx/finn/tree/dev/finn-rtllib>`_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU, and a FIFO for the weight stream are combined in a Verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting Verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode.
Advantages:
......
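Returning to the two modes above: the weight memory style is selected per node through the ``mem_mode`` node attribute. A hedged sketch (the op type and attribute name follow recent FINN versions; the node lookup is illustrative):

::

    from qonnx.custom_op.registry import getCustomOp  # assumed import path

    # Select the weight memory mode for the first MVAU node in the graph
    node = model.get_nodes_by_op_type("MatrixVectorActivation")[0]
    getCustomOp(node).set_nodeattr("mem_mode", "decoupled")  # or "const"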
@@ -10,7 +10,7 @@ Network Preparation
The main principles of FINN are analysis and transformation passes. If you would like more information about these, please have a look at sections :ref:`analysis_pass` and :ref:`transformation_pass`, or at chapter :ref:`tutorials` about the provided Jupyter notebooks.
This page is about the network preparation, the flow step that comes after the :ref:`brevitas_export`. Its main idea is to optimize the network and convert the nodes to custom nodes that correspond to `finn-hlslib <https://github.com/Xilinx/finn-hlslib>`_ functions. In this way we get a network that we can bring to hardware with the help of Vivado. For that we have to apply several transformations on the ONNX model, which this flow step receives wrapped in the :ref:`modelwrapper`.
This page is about the network preparation, the flow step that comes after the :ref:`brevitas_export`. Its main idea is to optimize the network and convert the nodes to custom nodes that correspond to `finn-hlslib <https://github.com/Xilinx/finn-hlslib>`_ functions. In this way we get a network that we can bring to hardware with the help of Vitis and Vivado. For that we have to apply several transformations on the ONNX model, which this flow step receives wrapped in the :ref:`modelwrapper`.
Various transformations are involved in the network preparation. The following is a short overview of these.
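Before that overview, a small sketch of how any of these transformations is applied in practice (the particular passes named here are illustrative):

::

    from qonnx.transformation.general import GiveUniqueNodeNames  # assumed path
    from qonnx.transformation.infer_shapes import InferShapes     # assumed path

    # Each transform call returns a new, transformed ModelWrapper
    model = model.transform(InferShapes())
    model = model.transform(GiveUniqueNodeNames())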
......
@@ -30,6 +30,7 @@ finn.analysis.fpgadataflow.floorplan\_params
:undoc-members:
:show-inheritance:
finn.analysis.fpgadataflow.hls\_synth\_res\_estimation
-------------------------------------------------------------
@@ -38,14 +39,15 @@ finn.analysis.fpgadataflow.hls\_synth\_res\_estimation
:undoc-members:
:show-inheritance:
finn.analysis.fpgadataflow.op\_and\_param\_counts
--------------------------------------------------
.. automodule:: finn.analysis.fpgadataflow.op_and_param_counts
:members:
:undoc-members:
:show-inheritance:
finn.analysis.fpgadataflow.post\_synth\_res
--------------------------------------------------
@@ -54,6 +56,7 @@ finn.analysis.fpgadataflow.post\_synth\_res
:undoc-members:
:show-inheritance:
finn.analysis.fpgadataflow.res\_estimation
-------------------------------------------------
......
@@ -9,9 +9,9 @@ finn.builder.build\_dataflow
----------------------------
.. automodule:: finn.builder.build_dataflow
:members:
:undoc-members:
:show-inheritance:
finn.builder.build\_dataflow\_config
------------------------------------
@@ -26,6 +26,6 @@ finn.builder.build\_dataflow\_steps
------------------------------------
.. automodule:: finn.builder.build_dataflow_steps
:members:
:undoc-members:
:show-inheritance: