diff --git a/.gitignore b/.gitignore index 0c1bbd84fe24be46446a7d714dd708d601813e53..29546af82ada311ad6307a5028d6f173ba2ffeb1 100644 --- a/.gitignore +++ b/.gitignore @@ -39,7 +39,7 @@ __pycache__/* .cache/* .*.swp -*/.ipynb_checkpoints/* +*.ipynb_checkpoints* # Project files .ropeproject @@ -88,6 +88,10 @@ MANIFEST # datasets for testing /dataset/ /data/ +*.csv # Google Drive key for dashboard /gdrive-key/ + +# generated files as part of end2end notebooks +/notebooks/end2end_example/**/*.onnx diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 76d522a0e827cce94383afaab61413aaa0752561..ee8d88789e16a6fde77850e742258dfc089c8659 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -12,7 +12,7 @@ gecho () { # checkout the correct dependency repo commits # the repos themselves are cloned in the Dockerfile -FINN_BASE_COMMIT=efcc0324fbca2476af25f7d3c060d51d5270f09a +FINN_BASE_COMMIT=1363981654009067790d5f2d0c3dd303b5fa05cb BREVITAS_COMMIT=aff49758ec445d77c75721c7de3091a2a1797ca8 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e diff --git a/notebooks/end2end_example/StreamingDataflowPartition_1.pdf b/notebooks/end2end_example/bnn-pynq/StreamingDataflowPartition_1.pdf similarity index 100% rename from notebooks/end2end_example/StreamingDataflowPartition_1.pdf rename to notebooks/end2end_example/bnn-pynq/StreamingDataflowPartition_1.pdf diff --git a/notebooks/end2end_example/cnv-mp-fc.png b/notebooks/end2end_example/bnn-pynq/cnv-mp-fc.png similarity index 100% rename from notebooks/end2end_example/cnv-mp-fc.png rename to notebooks/end2end_example/bnn-pynq/cnv-mp-fc.png diff --git a/notebooks/end2end_example/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb similarity index 100% rename from notebooks/end2end_example/cnv_end2end_example.ipynb rename to notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb diff --git a/notebooks/end2end_example/finn-design-flow-example.svg b/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg similarity index 100% rename from notebooks/end2end_example/finn-design-flow-example.svg rename to notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg diff --git a/notebooks/end2end_example/finn-hw-arch.png b/notebooks/end2end_example/bnn-pynq/finn-hw-arch.png similarity index 100% rename from notebooks/end2end_example/finn-hw-arch.png rename to notebooks/end2end_example/bnn-pynq/finn-hw-arch.png diff --git a/notebooks/end2end_example/pynq_shell_project.png b/notebooks/end2end_example/bnn-pynq/pynq_shell_project.png similarity index 100% rename from notebooks/end2end_example/pynq_shell_project.png rename to notebooks/end2end_example/bnn-pynq/pynq_shell_project.png diff --git a/notebooks/end2end_example/stitched_ip.png b/notebooks/end2end_example/bnn-pynq/stitched_ip.png similarity index 100% rename from notebooks/end2end_example/stitched_ip.png rename to notebooks/end2end_example/bnn-pynq/stitched_ip.png diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb similarity index 100% rename from notebooks/end2end_example/tfc_end2end_example.ipynb rename to notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb diff --git a/notebooks/end2end_example/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb similarity index 100% rename from notebooks/end2end_example/tfc_end2end_verification.ipynb 
rename to notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb diff --git a/notebooks/end2end_example/top.pdf b/notebooks/end2end_example/bnn-pynq/top.pdf similarity index 100% rename from notebooks/end2end_example/top.pdf rename to notebooks/end2end_example/bnn-pynq/top.pdf diff --git a/notebooks/end2end_example/verification.png b/notebooks/end2end_example/bnn-pynq/verification.png similarity index 100% rename from notebooks/end2end_example/verification.png rename to notebooks/end2end_example/bnn-pynq/verification.png diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..91a776f84e9554579d97447c9ca0889da5c29e48 --- /dev/null +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -0,0 +1,773 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train a Quantized MLP on UNSW-NB15 with Brevitas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we will show how to create, train and export a quantized Multi Layer Perceptron (MLP) with quantized weights and activations with [Brevitas](https://github.com/Xilinx/brevitas).\n", + "Specifically, the task at hand will be to label network packets as normal or suspicious (e.g. originating from an attacker, virus, malware or otherwise) by training on a quantized variant of the UNSW-NB15 dataset. \n", + "\n", + "**You won't need a GPU to train the neural net.** This MLP will be small enough to train on a modern x86 CPU, so no GPU is required to follow this tutorial. Alternatively, we provide pre-trained parameters for the MLP if you want to skip the training entirely.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A quick introduction to the task and the dataset\n", + "\n", + "*The task:* The goal of [*network intrusion detection*](https://ieeexplore.ieee.org/abstract/document/283931) is to identify, preferably in real time, unauthorized use, misuse, and abuse of computer systems by both system insiders and external penetrators. This may be achieved by a mix of techniques, and machine-learning (ML) based techniques are increasing in popularity. \n", + "\n", + "*The dataset:* Several datasets are available for use in ML-based methods for intrusion detection.\n", + "The [UNSW-NB15](https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/) is one such dataset created by the Australian Centre for Cyber Security (ACCS) to provide a comprehensive network based data set which can reflect modern network traffic scenarios. You can find more details about the dataset on [its homepage](https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/).\n", + "\n", + "*Performance considerations:* FPGAs are commonly used for implementing high-performance packet processing systems that still provide a degree of programmability. To avoid introducing bottlenecks on the network, the DNN implementation must be capable of detecting malicious packets at line rate, which can be millions of packets per second, and is expected to increase further as next-generation networking solutions provide increased\n", + "throughput. This is a good reason to consider FPGA acceleration for this particular use-case."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "-------------\n", + "\n", + "* [Initial setup](#initial_setup)\n", + "* [Define the Quantized MLP model](#define_quantized_mlp)\n", + "* [Load the UNSW_NB15 dataset](#load_dataset) \n", + "* [Define Train and Test Methods](#train_test)\n", + "* [(Option 1) Train the Model from Scratch](#train_scratch)\n", + "* [(Option 2) Load Pre-Trained Parameters](#load_pretrained)\n", + "* [Network Surgery Before Export](#network_surgery)\n", + "* [Export to FINN-ONNX](#export_finn_onnx)\n", + "* [View the Exported ONNX in Netron](#view_in_netron)\n", + "* [That's it!](#thats_it)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initial Setup <a id='initial_setup'></a>\n", + "\n", + "Let's start by making sure we have all the Python packages we'll need for this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas in /workspace/.local/lib/python3.6/site-packages (1.1.5)\n", + "Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.6/site-packages (from pandas) (2019.1)\n", + "Requirement already satisfied: numpy>=1.15.4 in /opt/conda/lib/python3.6/site-packages (from pandas) (1.19.4)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.6/site-packages (from pandas) (2.8.1)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", + "Requirement already satisfied: scikit-learn in /workspace/.local/lib/python3.6/site-packages (0.23.2)\n", + "Requirement already satisfied: scipy>=0.19.1 in /opt/conda/lib/python3.6/site-packages (from scikit-learn) (1.5.2)\n", + "Requirement already satisfied: joblib>=0.11 in /workspace/.local/lib/python3.6/site-packages (from scikit-learn) (1.0.0)\n", + "Requirement already satisfied: numpy>=1.13.3 in /opt/conda/lib/python3.6/site-packages (from scikit-learn) (1.19.4)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /workspace/.local/lib/python3.6/site-packages (from scikit-learn) (2.1.0)\n", + "Requirement already satisfied: tqdm in /opt/conda/lib/python3.6/site-packages (4.31.1)\n" + ] + } + ], + "source": [ + "!pip install --user pandas\n", + "!pip install --user scikit-learn\n", + "!pip install --user tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "import torch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**This is important -- always import onnx before torch**. This is a workaround for a [known bug](https://github.com/onnx/onnx/issues/2394)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define the Quantized MLP Model <a id='define_quantized_mlp'></a>\n", + "\n", + "We'll now define an MLP model that will be trained to perform inference with quantized weights and activations.\n", + "For this, we'll use the quantization-aware training (QAT) capabilities offered by[Brevitas](https://github.com/Xilinx/brevitas).\n", + "\n", + "Our MLP will have four fully-connected (FC) layers in total: three hidden layers with 64 neurons, and a final output layer with a single output, all using 2-bit weights. 
We'll use 2-bit quantized ReLU activation functions, and apply batch normalization between each FC layer and its activation.\n", + "\n", + "In case you'd like to experiment with different quantization settings or topology parameters, we'll define all these topology settings as variables." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "input_size = 593 \n", + "hidden1 = 64 \n", + "hidden2 = 64\n", + "hidden3 = 64\n", + "weight_bit_width = 2\n", + "act_bit_width = 2\n", + "num_classes = 1 " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can define our MLP using the layer primitives provided by Brevitas:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from brevitas.nn import QuantLinear, QuantReLU\n", + "import torch.nn as nn\n", + "\n", + "model = nn.Sequential(\n", + " QuantLinear(input_size, hidden1, bias=True, weight_bit_width=weight_bit_width),\n", + " nn.BatchNorm1d(hidden1),\n", + " nn.Dropout(0.5),\n", + " QuantReLU(bit_width=act_bit_width),\n", + " QuantLinear(hidden1, hidden2, bias=True, weight_bit_width=weight_bit_width),\n", + " nn.BatchNorm1d(hidden2),\n", + " nn.Dropout(0.5),\n", + " QuantReLU(bit_width=act_bit_width),\n", + " QuantLinear(hidden2, hidden3, bias=True, weight_bit_width=weight_bit_width),\n", + " nn.BatchNorm1d(hidden3),\n", + " nn.Dropout(0.5),\n", + " QuantReLU(bit_width=act_bit_width),\n", + " QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width)\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the MLP's output is not yet quantized. Even though we want the final output of our MLP to be a binary (0/1) value indicating the classification, we've only defined a single-neuron FC layer as the output. While training the network we'll pass that output through a sigmoid function as part of the loss criterion, which [gives better numerical stability](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html). Later on, after we're done training the network, we'll add a quantization node at the end before we export it to FINN." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the UNSW_NB15 Dataset <a id='load_dataset'></a>\n", + "\n", + "### Dataset Quantization <a id='dataset_qnt'></a>\n", + "\n", + "The goal of this notebook is to train a Quantized Neural Network (QNN) to be later deployed as an FPGA accelerator generated by the FINN compiler. Although we can choose a variety of different precisions for the input, [Murovic and Trost](https://ev.fe.uni-lj.si/1-2-2019/Murovic.pdf) have previously shown we can actually binarize the inputs and still get good (90%+) accuracy.\n", + "Thus, we will create a binarized representation for the dataset by following the procedure defined by [Murovic and Trost](https://ev.fe.uni-lj.si/1-2-2019/Murovic.pdf), which we repeat briefly here:\n", + "\n", + "* Original features have different formats ranging from integers, floating numbers to strings.\n", + "* Integers, which for example represent a packet lifetime, are binarized with as many bits as to include the maximum value. 
\n", + "* Another case is with features formatted as strings (protocols), which are binarized by simply counting the number of all different strings for each feature and coding them in the appropriate number of bits.\n", + "* Floating-point numbers are reformatted into fixed-point representation.\n", + "* In the end, each sample is transformed into a 593-bit wide binary vector. \n", + "* All vectors are labeled as bad (0) or normal (1)\n", + "\n", + "Following their open-source implementation provided as a Matlab script [here](https://github.com/TadejMurovic/BNN_Deployment/blob/master/cybersecurity_dataset_unswb15.m), we've created a [Python version](dataloader_quantized.py).\n", + "This `UNSW_NB15_quantized` class implements a PyTorch `DataLoader`, which represents a Python iterable over a dataset. This is useful because enables access to data in batches." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the training and test set from the [official website](https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/) - uncomment the following lines to download:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#! wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_training-set.csv\n", + "#! wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_testing-set.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.utils.data import DataLoader, Dataset\n", + "from dataloader_quantized import UNSW_NB15_quantized\n", + "\n", + "file_path_train = \"UNSW_NB15_training-set.csv\"\n", + "file_path_test = \"UNSW_NB15_testing-set.csv\"\n", + "\n", + "train_quantized_dataset = UNSW_NB15_quantized(file_path_train = file_path_train, \\\n", + " file_path_test = file_path_test, \\\n", + " train=True)\n", + "\n", + "test_quantized_dataset = UNSW_NB15_quantized(file_path_train = file_path_train, \\\n", + " file_path_test = file_path_test, \\\n", + " train=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 1000\n", + "\n", + "# dataset loaders\n", + "train_quantized_loader = DataLoader(train_quantized_dataset, batch_size=batch_size, shuffle=True)\n", + "test_quantized_loader = DataLoader(test_quantized_dataset, batch_size=batch_size, shuffle=True) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Train and Test Methods <a id='train_test'></a>\n", + "The train and test methods will use a `DataLoader`, which feeds the model with a new predefined batch of training data in each iteration, until the entire training data is fed to the model. Each repetition of this process is called an `epoch`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def train(model, train_loader, optimizer, criterion):\n", + " losses = []\n", + " # ensure model is in training mode\n", + " model.train() \n", + " \n", + " for i, data in enumerate(train_loader, 0): \n", + " inputs, target = data\n", + " optimizer.zero_grad() \n", + " \n", + " # forward pass\n", + " output = model(inputs.float())\n", + " loss = criterion(output, target.unsqueeze(1))\n", + " \n", + " # backward pass + run optimizer to update weights\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " # keep track of loss value\n", + " losses.append(loss.data.numpy()) \n", + " \n", + " return losses" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "def test(model, test_loader): \n", + " # ensure model is in eval mode\n", + " model.eval() \n", + " y_true = []\n", + " y_pred = []\n", + " \n", + " with torch.no_grad():\n", + " for data in test_loader:\n", + " inputs, target = data\n", + " output_orig = model(inputs.float())\n", + " # run the output through sigmoid\n", + " output = torch.sigmoid(output_orig) \n", + " # compare against a threshold of 0.5 to generate 0/1\n", + " pred = (output.detach().numpy() > 0.5) * 1\n", + " target = target.float()\n", + " y_true.extend(target.tolist()) \n", + " y_pred.extend(pred.reshape(-1).tolist())\n", + " \n", + " return accuracy_score(y_true, y_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (Option 1) Train the Model from Scratch <a id=\"train_scratch\"></a>\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before we start training our MLP we need to define some hyperparameters. Moreover, in order to monitor the loss function evolution over epochs, we need to define a method for it. As mentioned earlier, we'll use a loss criterion which applies a sigmoid function during the training phase (`BCEWithLogitsLoss`). For the testing phase, we're manually computing the sigmoid and thresholding at 0.5." 
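The note about the loss criterion can be made concrete with a tiny standalone check (an illustration added here, not one of the original notebook cells): `BCEWithLogitsLoss` applied to raw logits gives the same value as an explicit sigmoid followed by `BCELoss`, but computes it in a more numerically stable way.

```python
# Illustrative only: BCEWithLogitsLoss(logits, y) matches BCELoss(sigmoid(logits), y)
import torch
import torch.nn as nn

logits = torch.tensor([[2.5], [-1.0], [0.3]])   # made-up raw outputs of the final FC layer
targets = torch.tensor([[1.0], [0.0], [1.0]])   # made-up binary labels

fused = nn.BCEWithLogitsLoss()(logits, targets)
manual = nn.BCELoss()(torch.sigmoid(logits), targets)
print(fused.item(), manual.item())  # the two values should agree up to floating-point error
```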
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "num_epochs = 5\n", + "lr = 0.001 \n", + "\n", + "def display_loss_plot(losses, title=\"Training loss\", xlabel=\"Iterations\", ylabel=\"Loss\"):\n", + " x_axis = [i for i in range(len(losses))]\n", + " plt.plot(x_axis,losses)\n", + " plt.title(title)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(ylabel)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# loss criterion and optimizer\n", + "criterion = nn.BCEWithLogitsLoss()\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn.metrics import accuracy_score\n", + "from tqdm import tqdm, trange\n", + "\n", + "running_loss = []\n", + "running_test_acc = []\n", + "t = trange(num_epochs, desc=\"Training loss\", leave=True)\n", + "\n", + "for epoch in t:\n", + " loss_epoch = train(model, train_quantized_loader, optimizer,criterion)\n", + " test_acc = test(model, test_quantized_loader)\n", + " t.set_description(\"Training loss = %f test accuracy = %f\" % (np.mean(loss_epoch), test_acc))\n", + " t.refresh() # to show immediately the update \n", + " running_loss.append(loss_epoch)\n", + " running_test_acc.append(test_acc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "loss_per_epoch = [np.mean(loss_per_epoch) for loss_per_epoch in running_loss]\n", + "display_loss_plot(loss_per_epoch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test(model, test_quantized_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (Option 2) Load Pre-Trained Parameters <a id=\"load_pretrained\"></a>\n", + "\n", + "Instead of training from scratch, you can also use pre-trained parameters we provide here. These parameters should achieve ~91.9% test accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "IncompatibleKeys(missing_keys=[], unexpected_keys=[])" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "\n", + "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n", + "\n", + "model.load_state_dict(trained_state_dict, strict=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9188772287810328" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test(model, test_quantized_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Network Surgery Before Export <a id=\"network_surgery\"></a>\n", + "\n", + "Sometimes, it's desirable to make some changes to our trained network prior to export (this is known in general as \"network surgery\"). This depends on the model and is not generally necessary, but in this case we want to make a couple of changes to get better results with FINN." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start by padding the input. Our input vectors are 593-bit, which will make folding (parallelization) for the first layer a bit tricky since 593 is a prime number. So we'll pad the weight matrix of the first layer with seven 0-valued columns to work with an input size of 600 instead. When using the modified network we'll similarly provide inputs padded to 600 bits." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(64, 593)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from copy import deepcopy\n", + "\n", + "modified_model = deepcopy(model)\n", + "\n", + "W_orig = modified_model[0].weight.data.detach().numpy()\n", + "W_orig.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(64, 600)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# pad the second (593-sized) dimension with 7 zeroes at the end\n", + "W_new = np.pad(W_orig, [(0,0), (0,7)])\n", + "W_new.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([64, 600])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modified_model[0].weight.data = torch.from_numpy(W_new)\n", + "modified_model[0].weight.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll modify the expected input/output ranges. In FINN, we prefer to work with bipolar {-1, +1} instead of binary {0, 1} values. To achieve this, we'll create a \"wrapper\" model that handles the pre/postprocessing as follows:\n", + "\n", + "* on the input side, we'll pre-process by (x + 1) / 2 in order to map incoming {-1, +1} inputs to {0, 1} ones which the trained network is used to. Since we're just multiplying/adding a scalar, these operations can be *streamlined* in FINN and implemented with no extra cost.\n", + "\n", + "* on the output side, we'll add a binary quantizer which maps everything below 0 to -1 and everything above 0 to +1. This is essentially the same behavior as the sigmoid we used earlier, except the outputs are bipolar instead of binary."
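To make the two mappings above concrete, here is a toy example (an added illustration, not one of the original notebook cells) of the input shift and of what the binary output quantizer is expected to do:

```python
# Illustrative only: bipolar {-1,+1} inputs mapped to binary {0,1}, and logits mapped to bipolar outputs
import torch

bipolar_in = torch.tensor([-1.0, 1.0, -1.0])
binary_in = (bipolar_in + 1.0) / 2.0            # -> tensor([0., 1., 0.]), what the trained net expects

logits = torch.tensor([-0.7, 0.3, 2.1])         # made-up raw outputs of the final FC layer
bipolar_out = torch.where(logits > 0, torch.tensor(1.0), torch.tensor(-1.0))
print(binary_in, bipolar_out)                   # tensor([0., 1., 0.]) tensor([-1., 1., 1.])
```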
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "from brevitas.core.quant import QuantType\n", + "from brevitas.nn import QuantIdentity\n", + "\n", + "\n", + "class CybSecMLPForExport(nn.Module):\n", + " def __init__(self, my_pretrained_model):\n", + " super(CybSecMLPForExport, self).__init__()\n", + " self.pretrained = my_pretrained_model\n", + " self.qnt_output = QuantIdentity(quant_type=QuantType.BINARY, bit_width=1, min_val=-1.0, max_val=1.0)\n", + " \n", + " def forward(self, x):\n", + " # assume x contains bipolar {-1,1} elems\n", + " # shift from {-1,1} -> {0,1} since that is the\n", + " # input range for the trained network\n", + " x = (x + torch.tensor([1.0])) / 2.0 \n", + " out_original = self.pretrained(x)\n", + " out_final = self.qnt_output(out_original) # output as {-1,1} \n", + " return out_final\n", + "\n", + "model_for_export = CybSecMLPForExport(modified_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def test_padded_bipolar(model, test_loader): \n", + " # ensure model is in eval mode\n", + " model.eval() \n", + " y_true = []\n", + " y_pred = []\n", + " \n", + " with torch.no_grad():\n", + " for data in test_loader:\n", + " inputs, target = data\n", + " # pad inputs to 600 elements\n", + " input_padded = np.pad(inputs, [(0,0), (0,7)])\n", + " # convert inputs to {-1,+1}\n", + " input_scaled = 2*input_padded - 1\n", + " # run the model\n", + " output = model(torch.from_numpy(input_scaled).float())\n", + " y_pred.extend(list(output.flatten()))\n", + " # make targets bipolar {-1,+1}\n", + " expected = 2*target.float() - 1\n", + " expected = expected.detach().numpy()\n", + " y_true.extend(list(expected.flatten()))\n", + " \n", + " return accuracy_score(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9188772287810328" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_padded_bipolar(model_for_export, test_quantized_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export to FINN-ONNX <a id=\"export_finn_onnx\" ></a>\n", + "\n", + "FINN expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX, you can read more about this [here](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-finn-onnx)." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model saved to cybsec-mlp.onnx\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:15: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. 
In any other case, this might cause the trace to be incorrect.\n", + " from ipykernel import kernelapp as app\n" + ] + } + ], + "source": [ + "import brevitas.onnx as bo\n", + "\n", + "export_onnx_path = \"cybsec-mlp.onnx\"\n", + "input_shape = (1, 600)\n", + "bo.export_finn_onnx(model_for_export, input_shape, export_onnx_path)\n", + "\n", + "print(\"Model saved to %s\" % export_onnx_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View the Exported ONNX in Netron <a id=\"view_in_netron\" ></a>\n", + "\n", + "Let's examine the exported ONNX model with Netron. Particular things of note:\n", + "\n", + "* The input preprocessing (x + 1) / 2 is exported as part of the network (initial Add and Div layers)\n", + "* We've exported the padded version; shape of the first MatMul node's weight parameter is 600x64\n", + "* The weight parameters (second inputs) for MatMul nodes are annotated with `quantization: finn_datatype:INT2`\n", + "* The quantized activations are exported as `MultiThreshold` nodes with `domain=finn.custom_op.general`\n", + "* There's a final `MultiThreshold` node with threshold=0 to produce the final bipolar output (this is the `qnt_output` from `CybSecMLPForExport`" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Serving 'cybsec-mlp.onnx' at http://0.0.0.0:8081\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " <iframe\n", + " width=\"100%\"\n", + " height=\"400\"\n", + " src=\"http://0.0.0.0:8081/\"\n", + " frameborder=\"0\"\n", + " allowfullscreen\n", + " ></iframe>\n", + " " + ], + "text/plain": [ + "<IPython.lib.display.IFrame at 0x7f4045ac19e8>" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from finn.util.visualization import showInNetron\n", + "\n", + "showInNetron(export_onnx_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## That's it! <a id=\"thats_it\" ></a>\n", + "You created, trained and tested a quantized MLP that is ready to be loaded into FINN, congratulations! You can now proceed to the next notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/end2end_example/cybersecurity/2-export-to-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-export-to-finn-and-verify.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..f48cada0dd25f08f1659a778d04785bda27f443e --- /dev/null +++ b/notebooks/end2end_example/cybersecurity/2-export-to-finn-and-verify.ipynb @@ -0,0 +1,483 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Verify Exported ONNX Model in FINN\n", + "\n", + "**Important: This notebook depends on the 1-train-mlp-with-brevitas notebook, because we are using the ONNX model that was exported there. So please make sure the needed .onnx file is generated before you run this notebook. 
Also remember to 'close and halt' any other FINN notebooks, since Netron visualizations use the same port.**\n", + "\n", + "In this notebook we will show how to import the network we trained in Brevitas and verify it in the FINN compiler. \n", + "This verification process can actually be done at various stages in the compiler [as explained in this notebook](../bnn-pynq/tfc_end2end_verification.ipynb) but for this example we'll only consider the first step: verifying the exported high-level FINN-ONNX model.\n", + "Once this model is successfully verified, we'll generate an FPGA accelerator from it in the next notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx \n", + "import torch " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**This is important -- always import onnx before torch**. This is a workaround for a [known bug](https://github.com/onnx/onnx/issues/2394)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "-------------\n", + "1. [Import model and visualize in Netron](#brevitas_import_visualization)\n", + "2. [Network preparations: Tidy up transformations](#network_preparations)\n", + "3. [Load the dataset and Brevitas model](#load_dataset) \n", + "4. [Compare FINN and Brevitas execution](#compare_brevitas)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Import model and visualize in Netron <a id=\"brevitas_import_visualization\"></a>\n", + "\n", + "Now that we have the model in .onnx format, we can work with it using FINN. To import it into FINN, we'll use the [`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#finn.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.core.modelwrapper import ModelWrapper\n", + "\n", + "model_file_path = \"cybsec-mlp.onnx\"\n", + "model_for_sim = ModelWrapper(model_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To visualize the exported model, Netron can be used. Netron is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Serving 'cybsec-mlp.onnx' at http://0.0.0.0:8081\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " <iframe\n", + " width=\"100%\"\n", + " height=\"400\"\n", + " src=\"http://0.0.0.0:8081/\"\n", + " frameborder=\"0\"\n", + " allowfullscreen\n", + " ></iframe>\n", + " " + ], + "text/plain": [ + "<IPython.lib.display.IFrame at 0x7fc1fc950748>" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from finn.util.visualization import showInNetron\n", + "showInNetron(model_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Network preparation: Tidy up transformations <a id=\"network_preparations\"></a>\n", + "\n", + "Before running the verification, we need to prepare our FINN-ONNX model.
In particular, all the intermediate tensors need to have statically defined shapes. To do this, we apply some transformations to the model like a kind of \"tidy-up\" to make it easier to process. You can read more about these transformations in [this notebook](../bnn-pynq/tfc_end2end_example.ipynb).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n", + "from finn.transformation.infer_shapes import InferShapes\n", + "from finn.transformation.infer_datatypes import InferDataTypes\n", + "from finn.transformation.fold_constants import FoldConstants\n", + "\n", + "model_for_sim = model_for_sim.transform(InferShapes())\n", + "model_for_sim = model_for_sim.transform(FoldConstants())\n", + "model_for_sim = model_for_sim.transform(GiveUniqueNodeNames())\n", + "model_for_sim = model_for_sim.transform(GiveReadableTensorNames())\n", + "model_for_sim = model_for_sim.transform(InferDataTypes())\n", + "model_for_sim = model_for_sim.transform(RemoveStaticGraphInputs())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There's one more thing we'll do: we will mark the input tensor datatype as bipolar, which will be used by the compiler later on. \n", + "\n", + "*In the near future it will be possible to add this information to the model while exporting, instead of having to add it manually.*" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input tensor name: global_in\n", + "Output tensor name: global_out\n", + "Input tensor shape: [1, 600]\n", + "Input tensor datatype: DataType.BIPOLAR\n" + ] + } + ], + "source": [ + "from finn.core.datatype import DataType\n", + "\n", + "finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n", + "finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n", + "print(\"Input tensor name: %s\" % finnonnx_in_tensor_name)\n", + "print(\"Output tensor name: %s\" % finnonnx_out_tensor_name)\n", + "finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)\n", + "print(\"Input tensor shape: %s\" % str(finnonnx_model_in_shape))\n", + "model_for_sim.set_tensor_datatype(finnonnx_in_tensor_name, DataType.BIPOLAR)\n", + "print(\"Input tensor datatype: %s\" % str(model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)))\n", + "\n", + "verified_model_filename = \"cybsec-mlp-verified.onnx\"\n", + "model_for_sim.save(verified_model_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's view our ready-to-go model. 
Some changes to note:\n", + "\n", + "* all intermediate tensors now have their shapes specified (indicated by numbers next to the arrows going between layers)\n", + "* the datatype on the input tensor is set to DataType.BIPOLAR (click on the `global_in` node to view properties)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Stopping http://0.0.0.0:8081\n", + "Serving 'cybsec-mlp-verified.onnx' at http://0.0.0.0:8081\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " <iframe\n", + " width=\"100%\"\n", + " height=\"400\"\n", + " src=\"http://0.0.0.0:8081/\"\n", + " frameborder=\"0\"\n", + " allowfullscreen\n", + " ></iframe>\n", + " " + ], + "text/plain": [ + "<IPython.lib.display.IFrame at 0x7fc280154278>" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "showInNetron(verified_model_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Load the Dataset and the Brevitas Model <a id=\"load_dataset\"></a>\n", + "\n", + "We'll use some example data from the quantized UNSW-NB15 dataset (from the previous notebook) to use as inputs for the verification. \n", + "\n", + "Recall that the quantized values from the dataset are 593-bit binary {0, 1} vectors whereas our exported model takes 600-bit bipolar {-1, +1} vectors, so we'll have to preprocess it a bit before we can use it for verifying the ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 593])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from torch.utils.data import DataLoader, Dataset\n", + "from dataloader_quantized import UNSW_NB15_quantized\n", + "\n", + "test_quantized_dataset = UNSW_NB15_quantized(file_path_train='UNSW_NB15_training-set.csv', \\\n", + " file_path_test = \"UNSW_NB15_testing-set.csv\", \\\n", + " train=False)\n", + "\n", + "n_verification_inputs = 100\n", + "# last column is the label, exclude it\n", + "input_tensor = test_quantized_dataset.data[:n_verification_inputs,:-1]\n", + "input_tensor.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's also bring up the MLP we trained in Brevitas from the previous notebook. We'll compare its outputs to what is generated by FINN." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "IncompatibleKeys(missing_keys=[], unexpected_keys=[])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_size = 593 \n", + "hidden1 = 64 \n", + "hidden2 = 64\n", + "hidden3 = 64\n", + "weight_bit_width = 2\n", + "act_bit_width = 2\n", + "num_classes = 1\n", + "\n", + "from brevitas.nn import QuantLinear, QuantReLU\n", + "import torch.nn as nn\n", + "\n", + "brevitas_model = nn.Sequential(\n", + " QuantLinear(input_size, hidden1, bias=True, weight_bit_width=weight_bit_width),\n", + " nn.BatchNorm1d(hidden1),\n", + " nn.Dropout(0.5),\n", + " QuantReLU(bit_width=act_bit_width),\n", + " QuantLinear(hidden1, hidden2, bias=True, weight_bit_width=weight_bit_width),\n", + " nn.BatchNorm1d(hidden2),\n", + " nn.Dropout(0.5),\n", + " QuantReLU(bit_width=act_bit_width),\n", + " QuantLinear(hidden2, hidden3, bias=True, weight_bit_width=weight_bit_width),\n", + " nn.BatchNorm1d(hidden3),\n", + " nn.Dropout(0.5),\n", + " QuantReLU(bit_width=act_bit_width),\n", + " QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width)\n", + ")\n", + "\n", + "# replace this with your trained network checkpoint if you're not\n", + "# using the pretrained weights\n", + "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n", + "brevitas_model.load_state_dict(trained_state_dict, strict=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def inference_with_brevitas(current_inp):\n", + " brevitas_output = brevitas_model.forward(current_inp)\n", + " # apply sigmoid + threshold\n", + " brevitas_output = torch.sigmoid(brevitas_output)\n", + " brevitas_output = (brevitas_output.detach().numpy() > 0.5) * 1\n", + " # convert output to bipolar\n", + " brevitas_output = 2*brevitas_output - 1\n", + " return brevitas_output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Compare FINN & Brevitas execution <a id=\"compare_brevitas\"></a>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's make helper functions to execute the same input with Brevitas and FINN. For FINN, we'll use the [`finn.core.onnx_exec`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#finn.core.onnx_exec.execute_onnx) function to execute the exported FINN-ONNX on the inputs." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def inference_with_finn_onnx(current_inp):\n", + " # convert input to numpy for FINN\n", + " current_inp = current_inp.detach().numpy()\n", + " # add padding and re-scale to bipolar\n", + " current_inp = np.pad(current_inp, [(0, 0), (0, 7)])\n", + " current_inp = 2*current_inp-1\n", + " # reshape to expected input (add 1 for batch dimension)\n", + " current_inp = current_inp.reshape(finnonnx_model_in_shape)\n", + " # create the input dictionary\n", + " input_dict = {finnonnx_in_tensor_name : current_inp} \n", + " # run with FINN's execute_onnx\n", + " output_dict = oxe.execute_onnx(model_for_sim, input_dict)\n", + " #get the output tensor\n", + " finn_output = output_dict[finnonnx_out_tensor_name] \n", + " return finn_output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can call our inference helper functions for each input and compare the outputs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ok 100 nok 0: 100%|██████████| 100/100 [00:48<00:00, 2.09it/s]\n" + ] + } + ], + "source": [ + "import finn.core.onnx_exec as oxe\n", + "import numpy as np\n", + "from tqdm import trange\n", + "\n", + "verify_range = trange(n_verification_inputs, desc=\"FINN execution\", position=0, leave=True)\n", + "brevitas_model.eval()\n", + "\n", + "ok = 0\n", + "nok = 0\n", + "\n", + "for i in verify_range:\n", + "    # run in Brevitas with PyTorch tensor\n", + "    current_inp = input_tensor[i].reshape((1, 593))\n", + "    brevitas_output = inference_with_brevitas(current_inp)\n", + "    finn_output = inference_with_finn_onnx(current_inp)\n", + "    # compare the outputs\n", + "    ok += 1 if finn_output == brevitas_output else 0\n", + "    nok += 1 if finn_output != brevitas_output else 0\n", + "    verify_range.set_description(\"ok %d nok %d\" % (ok, nok))\n", + "    verify_range.refresh() # to show immediately the update" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical\n" + ] + } + ], + "source": [ + "if ok == n_verification_inputs:\n", + "    print(\"Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical\")\n", + "else:\n", + "    print(\"Verification failed. Brevitas and FINN-ONNX execution outputs are NOT identical\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This concludes our second notebook. In the next one, we'll take the ONNX model we just verified all the way down to FPGA hardware with the FINN compiler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..1ee1cefbe17d96ffd7a2e6384e037e1d9fbdd989 --- /dev/null +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -0,0 +1,804 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building the Streaming Dataflow Accelerator\n", + "\n", + "**Important: This notebook depends on the 2-export-to-finn-and-verify notebook because we are using models that were created by the previous notebooks. So please make sure the needed .onnx files are generated prior to running this notebook.**\n", + "\n", + "<img align=\"left\" src=\"finn-example.png\" alt=\"drawing\" style=\"margin-right: 20px\" width=\"250\"/>\n", + "\n", + "In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from our quantized MLP for the cybersecurity task.
The key idea in such architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, illustrated on the figure to the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vivado HLS description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.\n", + "\n", + "These implementations offer a good balance of performance and flexibility, but building them by hand is difficult and time-consuming. This is where the FINN compiler comes in: it can build streaming dataflow accelerators from an ONNX description to match the desired throughput." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "-------------\n", + "\n", + "1. [Introduction to `build_dataflow` Tool](#intro_build_dataflow) \n", + "2. [Understanding the Build Configuration: `DataflowBuildConfig`](#underst_build_conf) \n", + " 2.1. [Output Products](#output_prod) \n", + " 2.2. [Configuring the Board and FPGA Part](#config_fpga) \n", + " 2.3. [Configuring the Performance](#config_perf) \n", + "3. [Launch a Build: Only Estimate Reports](#build_estimate_report)\n", + "4. [Launch a Build: Stitched IP, out-of-context synth and rtlsim Performance](#build_ip_synth_rtlsim)\n", + "5. [Launch a Build: PYNQ Bitfile and Driver](#build_bitfile_driver)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction to `build_dataflow` Tool <a id=\"intro_build_dataflow\"></a>\n", + "\n", + "Since version 0.5b, the FINN compiler has a `build_dataflow` tool. Compared to previous versions, which required setting up all the needed transformations in a Python script, it makes experimenting with dataflow architecture generation easier. The core idea is to specify the relevant build info as a configuration `dict`, which invokes all the necessary steps to make the dataflow build happen. It can be invoked either from the [command line](https://finn-dev.readthedocs.io/en/latest/command_line.html) or with a single Python function call.\n", + "\n", + "\n", + "In this notebook, we'll use the Python function call to invoke the builds to stay inside the Jupyter notebook, but feel free to experiment with reproducing what we do here with the `./run-docker.sh build_dataflow` and `./run-docker.sh build_custom` command-line entry points too, as documented [here](https://finn-dev.readthedocs.io/en/latest/command_line.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding the Build Configuration: `DataflowBuildConfig` <a id=\"underst_build_conf\"></a>\n", + "\n", + "The build configuration is specified by an instance of `finn.builder.build_dataflow_config.DataflowBuildConfig`. The configuration is a Python [`dataclass`](https://docs.python.org/3/library/dataclasses.html) which can be serialized into or de-serialized from JSON files for persistence, although we'll just set it up in Python here.\n", + "There are many options in the configuration to customize different aspects of the build; we'll only cover a few of them in this notebook.
You can read the details on all the config options on [the FINN API documentation](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowBuildConfig).\n", + "\n", + "Let's go over some of the members of the `DataflowBuildConfig`:\n", + "\n", + "### Output Products <a id=\"output_prod\"></a>\n", + "\n", + "The build can produce many different outputs, and some of them can take a long time (e.g. bitfile synthesis for a large network). When you first start working on generating a new accelerator and exploring the different performance options, you may not want to go all the way to a bitfile. Thus, in the beginning you may just select the estimate reports as the output products. Gradually, you can generate the output products from later stages until you are happy enough with the design to build the full accelerator integrated into a shell.\n", + "\n", + "The output products are controlled by:\n", + "\n", + "* `generate_outputs`: list of output products (of type [`finn.builder.build_dataflow_config.DataflowOutputType`](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowOutputType)) that will be generated by the build. Some available options are:\n", + "    - `ESTIMATE_REPORTS` : report expected resources and performance per layer and for the whole network without any synthesis\n", + "    - `STITCHED_IP` : create a stream-in stream-out IP design that can be integrated into other Vivado IPI or RTL designs\n", + "    - `RTLSIM_PERFORMANCE` : use PyVerilator to do a performance/latency test of the `STITCHED_IP` design\n", + "    - `OOC_SYNTH` : run out-of-context synthesis (just the accelerator itself, without any system surrounding it) on the `STITCHED_IP` design to get post-synthesis FPGA resources and achievable clock frequency\n", + "    - `BITFILE` : integrate the accelerator into a shell to produce a standalone bitfile\n", + "    - `PYNQ_DRIVER` : generate a PYNQ Python driver that can be used to launch the accelerator\n", + "    - `DEPLOYMENT_PACKAGE` : create a folder with the `BITFILE` and `PYNQ_DRIVER` outputs, ready to be copied to the target FPGA platform.\n", + "* `output_dir`: the directory where all the generated build outputs above will be written.\n", + "* `steps`: list of predefined (or custom) build steps FINN will go through. Use `build_dataflow_config.estimate_only_dataflow_steps` to execute only the steps needed for estimation (without any synthesis), and the `build_dataflow_config.default_build_dataflow_steps` otherwise (which is the default value).\n", + "\n", + "### Configuring the Board and FPGA Part <a id=\"config_fpga\"></a>\n", + "\n", + "* `fpga_part`: Xilinx FPGA part to be used for synthesis, can be left unspecified to be inferred from `board` below, or specified explicitly for e.g. out-of-context synthesis.\n", + "* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell.
See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/basic.py#L41) for a list of possible boards.\n", + "* `shell_flow_type`: the target [shell flow type](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.ShellFlowType), only needed for generating full bitfiles where the FINN design is integrated into a shell (so only needed if `BITFILE` is selected) \n", + "\n", + "### Configuring the Performance <a id=\"config_perf\"></a>\n", + "\n", + "You can configure the performance (and correspondingly, the FPGA resource footprint) of the generated accelerator in two ways:\n", + "\n", + "1) (basic) Set a target performance and let the compiler figure out the per-node parallelization settings.\n", + "\n", + "2) (advanced) Specify a separate .json as `folding_config_file` that lists the degree of parallelization (as well as other hardware options) for each layer.\n", + "\n", + "This notebook only deals with the basic approach, for which you need to set up:\n", + "\n", + "* `target_fps`: target inference performance in frames per second. Note that the target may not be achievable due to specific layer constraints, or due to resource limitations of the FPGA.\n", + "* `synth_clk_period_ns`: target clock period (in nanoseconds) for Vivado synthesis, e.g. `synth_clk_period_ns=5.0` will target a 200 MHz clock. Note that the target clock period may not be achievable depending on the FPGA part and design complexity." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch a Build: Only Estimate Reports <a id=\"build_estimate_report\"></a>\n", + "\n", + "First, we'll launch a build that only generates the estimate reports, which does not require any synthesis. Note two things below: how the `generate_outputs` only contains `ESTIMATE_REPORTS`, but also how the `steps` uses a value of `estimate_only_dataflow_steps`. This skips steps like HLS synthesis to provide a quick estimate from analytical models.
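Before launching the build, a rough back-of-the-envelope check (an added illustration using the settings from the next cell) shows how `target_fps` and `synth_clk_period_ns` translate into a per-layer cycle budget:

```python
# Illustrative only: cycle budget implied by the build settings used below
synth_clk_period_ns = 10.0                 # 10 ns period -> 100 MHz clock
target_fps = 1000000

clock_freq_hz = 1e9 / synth_clk_period_ns  # 100e6 cycles per second
cycle_budget = clock_freq_hz / target_fps  # ~100 cycles available for the slowest layer
print(cycle_budget)
```

The per-layer cycle estimates reported further down (at most 80 cycles for `StreamingFCLayer_Batch_0`) fit within this budget, which is why the estimated throughput (100e6 / 80 = 1.25M FPS) ends up slightly above the requested `target_fps`.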
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Building dataflow accelerator from cybsec-mlp-verified.onnx\n", + "Intermediate outputs will be generated in /tmp/finn_dev_osboxes\n", + "Final outputs will be generated in output_estimates_only\n", + "Build log is at output_estimates_only/build_dataflow.log\n", + "Running step: step_tidy_up [1/7]\n", + "Running step: step_streamline [2/7]\n", + "Running step: step_convert_to_hls [3/7]\n", + "Running step: step_create_dataflow_partition [4/7]\n", + "Running step: step_target_fps_parallelization [5/7]\n", + "Running step: step_apply_folding_config [6/7]\n", + "Running step: step_generate_estimate_reports [7/7]\n", + "Completed successfully\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import finn.builder.build_dataflow as build\n", + "import finn.builder.build_dataflow_config as build_cfg\n", + "\n", + "model_file = \"cybsec-mlp-verified.onnx\"\n", + "\n", + "estimates_output_dir = \"output_estimates_only\"\n", + "\n", + "cfg = build.DataflowBuildConfig(\n", + " output_dir = estimates_output_dir,\n", + " target_fps = 1000000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_cfg.estimate_only_dataflow_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")\n", + "\n", + "build.build_dataflow_cfg(model_file, cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll now examine the generated outputs from this build. If we look under the outputs directory, we'll find a subfolder with the generated estimate reports." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "build_dataflow.log intermediate_models report time_per_step.json\r\n" + ] + } + ], + "source": [ + "! ls {estimates_output_dir}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "estimate_layer_config_alternatives.json estimate_network_performance.json\r\n", + "estimate_layer_cycles.json\t\t op_and_param_counts.json\r\n", + "estimate_layer_resources.json\r\n" + ] + } + ], + "source": [ + "! ls {estimates_output_dir}/report" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that various reports have been generated as .json files. Let's examine the contents of the `estimate_network_performance.json` for starters. Here, we can see the analytical estimates for the performance and latency." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\r\n", + " \"critical_path_cycles\": 272,\r\n", + " \"max_cycles\": 80,\r\n", + " \"max_cycles_node_name\": \"StreamingFCLayer_Batch_0\",\r\n", + " \"estimated_throughput_fps\": 1250000.0,\r\n", + " \"estimated_latency_ns\": 2720.0\r\n", + "}" + ] + } + ], + "source": [ + "! cat {estimates_output_dir}/report/estimate_network_performance.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since all of these reports are .json files, we can easily load them into Python for further processing. 
Let's define a helper function and look at the `estimate_layer_cycles.json` report." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "def read_json_dict(filename):\n", + " with open(filename, \"r\") as f:\n", + " ret = json.load(f)\n", + " return ret" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'StreamingFCLayer_Batch_0': 80,\n", + " 'StreamingFCLayer_Batch_1': 64,\n", + " 'StreamingFCLayer_Batch_2': 64,\n", + " 'StreamingFCLayer_Batch_3': 64}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "read_json_dict(estimates_output_dir + \"/report/estimate_layer_cycles.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we can see the estimated number of clock cycles each layer will take. Recall that all of these layers will be running in parallel, and the slowest layer will determine the overall throughput of the entire neural network. FINN attempts to parallelize each layer such that they all take a similar number of cycles, and less than the corresponding number of cycles that would be required to meet `target_fps`.\n", + "\n", + "Finally, we can see the layer-by-layer resource estimates in the `estimate_layer_resources.json` report:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'StreamingFCLayer_Batch_0': {'BRAM_18K': 27,\n", + " 'BRAM_efficiency': 0.15432098765432098,\n", + " 'LUT': 8149,\n", + " 'URAM': 0,\n", + " 'URAM_efficiency': 1,\n", + " 'DSP': 0},\n", + " 'StreamingFCLayer_Batch_1': {'BRAM_18K': 4,\n", + " 'BRAM_efficiency': 0.1111111111111111,\n", + " 'LUT': 1435,\n", + " 'URAM': 0,\n", + " 'URAM_efficiency': 1,\n", + " 'DSP': 0},\n", + " 'StreamingFCLayer_Batch_2': {'BRAM_18K': 4,\n", + " 'BRAM_efficiency': 0.1111111111111111,\n", + " 'LUT': 1435,\n", + " 'URAM': 0,\n", + " 'URAM_efficiency': 1,\n", + " 'DSP': 0},\n", + " 'StreamingFCLayer_Batch_3': {'BRAM_18K': 1,\n", + " 'BRAM_efficiency': 0.006944444444444444,\n", + " 'LUT': 341,\n", + " 'URAM': 0,\n", + " 'URAM_efficiency': 1,\n", + " 'DSP': 0},\n", + " 'total': {'BRAM_18K': 36.0, 'LUT': 11360.0, 'URAM': 0.0, 'DSP': 0.0}}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "read_json_dict(estimates_output_dir + \"/report/estimate_layer_resources.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This particular report is useful to determine whether the current configuration will fit into a particular FPGA. If you see that the resource requirements are too high for the FPGA you had in mind, you should consider lowering the `target_fps`.\n", + "\n", + "*Note that the analytical models tend to over-estimate how much resources are needed, since they can't capture the effects of various synthesis optimizations.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch a Build: Stitched IP, out-of-context synth and rtlsim Performance <a id=\"build_ip_synth_rtlsim\"></a>\n", + "\n", + "Once we have a configuration that gives satisfactory estimates, we can move on to generating the accelerator. We can do this in different ways depending on how we want to integrate the accelerator into a larger system. 
For instance, if we have a larger streaming system built in Vivado or if we'd like to re-use this generated accelerator as an IP component in other projects, the `STITCHED_IP` output product is a good choice. We can also use the `OOC_SYNTH` output product to get post-synthesis resource and clock frequency numbers for our accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Building dataflow accelerator from cybsec-mlp-verified.onnx\n", + "Intermediate outputs will be generated in /tmp/finn_dev_osboxes\n", + "Final outputs will be generated in output_ipstitch_ooc_rtlsim\n", + "Build log is at output_ipstitch_ooc_rtlsim/build_dataflow.log\n", + "Running step: step_tidy_up [1/15]\n", + "Running step: step_streamline [2/15]\n", + "Running step: step_convert_to_hls [3/15]\n", + "Running step: step_create_dataflow_partition [4/15]\n", + "Running step: step_target_fps_parallelization [5/15]\n", + "Running step: step_apply_folding_config [6/15]\n", + "Running step: step_generate_estimate_reports [7/15]\n", + "Running step: step_hls_ipgen [8/15]\n", + "Running step: step_set_fifo_depths [9/15]\n", + "Running step: step_create_stitched_ip [10/15]\n", + "Running step: step_measure_rtlsim_performance [11/15]\n", + "Running step: step_make_pynq_driver [12/15]\n", + "Running step: step_out_of_context_synthesis [13/15]\n", + "Running step: step_synthesize_bitfile [14/15]\n", + "Running step: step_deployment_package [15/15]\n", + "Completed successfully\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import finn.builder.build_dataflow as build\n", + "import finn.builder.build_dataflow_config as build_cfg\n", + "\n", + "model_file = \"cybsec-mlp-verified.onnx\"\n", + "\n", + "rtlsim_output_dir = \"output_ipstitch_ooc_rtlsim\"\n", + "\n", + "cfg = build.DataflowBuildConfig(\n", + " output_dir = rtlsim_output_dir,\n", + " target_fps = 1000000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.STITCHED_IP,\n", + " build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,\n", + " build_cfg.DataflowOutputType.OOC_SYNTH,\n", + " ]\n", + ")\n", + "\n", + "build.build_dataflow_cfg(model_file, cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Among the output products, we will find the accelerator exported as IP:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "all_verilog_srcs.txt\t\t finn_vivado_stitch_proj.xpr\r\n", + "finn_vivado_stitch_proj.cache\t ip\r\n", + "finn_vivado_stitch_proj.hbs\t make_project.sh\r\n", + "finn_vivado_stitch_proj.hw\t make_project.tcl\r\n", + "finn_vivado_stitch_proj.ip_user_files vivado.jou\r\n", + "finn_vivado_stitch_proj.srcs\t vivado.log\r\n" + ] + } + ], + "source": [ + "! ls {rtlsim_output_dir}/stitched_ip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also have a few reports generated by these output products, different from the ones generated by `ESTIMATE_REPORTS`." 
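Since these are again plain .json files, you can also pull every report from a build into Python in one go; a small convenience sketch (just `pathlib` and `json`, not part of the FINN builder API):

```python
import json
from pathlib import Path

def load_reports(output_dir):
    """Load every .json report under <output_dir>/report into one dict keyed by file name."""
    reports = {}
    for report_file in Path(output_dir, "report").glob("*.json"):
        with open(report_file) as f:
            reports[report_file.stem] = json.load(f)
    return reports

reports = load_reports("output_ipstitch_ooc_rtlsim")
print(sorted(reports))  # e.g. ['estimate_layer_resources_hls', 'ooc_synth_and_timing', 'rtlsim_performance']
```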
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "estimate_layer_resources_hls.json rtlsim_performance.json\r\n", + "ooc_synth_and_timing.json\r\n" + ] + } + ], + "source": [ + "! ls {rtlsim_output_dir}/report" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In `ooc_synth_and_timing.json` we can find the post-synthesis and maximum clock frequency estimate for the accelerator. Note that the clock frequency estimate here tends to be optimistic, since out-of-context synthesis is less constrained." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\r\n", + " \"vivado_proj_folder\": \"/tmp/finn_dev_osboxes/synth_out_of_context_wy3b6qf4/results_finn_design_wrapper\",\r\n", + " \"LUT\": 7073.0,\r\n", + " \"FF\": 7534.0,\r\n", + " \"DSP\": 0.0,\r\n", + " \"BRAM\": 18.0,\r\n", + " \"WNS\": 0.632,\r\n", + " \"\": 0,\r\n", + " \"fmax_mhz\": 106.7463706233988,\r\n", + " \"estimated_throughput_fps\": 1334329.6327924852\r\n", + "}" + ] + } + ], + "source": [ + "! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In `rtlsim_performance.json` we can find the steady-state throughput and latency for the accelerator, as obtained by rtlsim. If the DRAM bandwidth numbers reported here are below what the hardware platform is capable of (i.e. the accelerator is not memory-bound), you can expect the same steady-state throughput in real hardware." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\r\n", + " \"cycles\": 838,\r\n", + " \"runtime[ms]\": 0.00838,\r\n", + " \"throughput[images/s]\": 954653.9379474939,\r\n", + " \"DRAM_in_bandwidth[Mb/s]\": 71.59904534606204,\r\n", + " \"DRAM_out_bandwidth[Mb/s]\": 0.11933174224343673,\r\n", + " \"fclk[mhz]\": 100.0,\r\n", + " \"N\": 8,\r\n", + " \"latency_cycles\": 229\r\n", + "}" + ] + } + ], + "source": [ + "! cat {rtlsim_output_dir}/report/rtlsim_performance.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's have a look at `final_hw_config.json`. This is the node-by-node hardware configuration determined by the FINN compiler, including FIFO depths, parallelization settings (PE/SIMD) and others. If you want to optimize your build further (the \"advanced\" method we mentioned under \"Configuring the performance\"), you can use this .json file as the `folding_config_file` for a new run to use it as a starting point for further exploration and optimizations." 
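The file itself is printed in the next cell. A follow-up run that starts from a hand-edited copy of it could look like the sketch below (same `DataflowBuildConfig` API as before; the config file name and output directory are hypothetical, and you would tune the PE/SIMD values in your copy by hand):

```python
# Sketch: drive the build from an edited copy of final_hw_config.json instead of
# letting target_fps-driven parallelization pick the per-layer settings.
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg

cfg_tuned = build.DataflowBuildConfig(
    output_dir="output_estimates_tuned",           # hypothetical output directory
    folding_config_file="my_folding_config.json",  # edited copy of final_hw_config.json
    synth_clk_period_ns=10.0,
    fpga_part="xc7z020clg400-1",
    generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
)
# build.build_dataflow_cfg("cybsec-mlp-verified.onnx", cfg_tuned)
```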
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\r\n", + " \"Defaults\": {},\r\n", + " \"StreamingFIFO_0\": {\r\n", + " \"ram_style\": \"auto\",\r\n", + " \"depth\": 32,\r\n", + " \"impl_style\": \"rtl\"\r\n", + " },\r\n", + " \"StreamingFCLayer_Batch_0\": {\r\n", + " \"PE\": 32,\r\n", + " \"SIMD\": 15,\r\n", + " \"ram_style\": \"auto\",\r\n", + " \"resType\": \"lut\",\r\n", + " \"mem_mode\": \"decoupled\",\r\n", + " \"runtime_writeable_weights\": 0\r\n", + " },\r\n", + " \"StreamingDataWidthConverter_Batch_0\": {\r\n", + " \"impl_style\": \"hls\"\r\n", + " },\r\n", + " \"StreamingFCLayer_Batch_1\": {\r\n", + " \"PE\": 4,\r\n", + " \"SIMD\": 16,\r\n", + " \"ram_style\": \"auto\",\r\n", + " \"resType\": \"lut\",\r\n", + " \"mem_mode\": \"decoupled\",\r\n", + " \"runtime_writeable_weights\": 0\r\n", + " },\r\n", + " \"StreamingDataWidthConverter_Batch_1\": {\r\n", + " \"impl_style\": \"hls\"\r\n", + " },\r\n", + " \"StreamingFCLayer_Batch_2\": {\r\n", + " \"PE\": 4,\r\n", + " \"SIMD\": 16,\r\n", + " \"ram_style\": \"auto\",\r\n", + " \"resType\": \"lut\",\r\n", + " \"mem_mode\": \"decoupled\",\r\n", + " \"runtime_writeable_weights\": 0\r\n", + " },\r\n", + " \"StreamingDataWidthConverter_Batch_2\": {\r\n", + " \"impl_style\": \"hls\"\r\n", + " },\r\n", + " \"StreamingFCLayer_Batch_3\": {\r\n", + " \"PE\": 1,\r\n", + " \"SIMD\": 1,\r\n", + " \"ram_style\": \"auto\",\r\n", + " \"resType\": \"lut\",\r\n", + " \"mem_mode\": \"decoupled\",\r\n", + " \"runtime_writeable_weights\": 0\r\n", + " }\r\n", + "}" + ] + } + ], + "source": [ + "! cat {rtlsim_output_dir}/final_hw_config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch a Build: PYNQ Bitfile and Driver <a id=\"build_bitfile_driver\"></a>" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Building dataflow accelerator from cybsec-mlp-verified.onnx\n", + "Intermediate outputs will be generated in /tmp/finn_dev_osboxes\n", + "Final outputs will be generated in output_final\n", + "Build log is at output_final/build_dataflow.log\n", + "Running step: step_tidy_up [1/15]\n", + "Running step: step_streamline [2/15]\n", + "Running step: step_convert_to_hls [3/15]\n", + "Running step: step_create_dataflow_partition [4/15]\n", + "Running step: step_target_fps_parallelization [5/15]\n", + "Running step: step_apply_folding_config [6/15]\n", + "Running step: step_generate_estimate_reports [7/15]\n", + "Running step: step_hls_ipgen [8/15]\n", + "Running step: step_set_fifo_depths [9/15]\n", + "Running step: step_create_stitched_ip [10/15]\n", + "Running step: step_measure_rtlsim_performance [11/15]\n", + "Running step: step_make_pynq_driver [12/15]\n", + "Running step: step_out_of_context_synthesis [13/15]\n", + "Running step: step_synthesize_bitfile [14/15]\n", + "Running step: step_deployment_package [15/15]\n", + "Completed successfully\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import finn.builder.build_dataflow as build\n", + "import finn.builder.build_dataflow_config as build_cfg\n", + "\n", + "model_file = \"cybsec-mlp-verified.onnx\"\n", + "\n", + "final_output_dir = \"output_final\"\n", + "\n", + "cfg = build.DataflowBuildConfig(\n", + " output_dir = final_output_dir,\n", 
+ " target_fps = 1000000,\n", + " synth_clk_period_ns = 10.0,\n", + " board = \"Pynq-Z1\",\n", + " shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.BITFILE,\n", + " build_cfg.DataflowOutputType.PYNQ_DRIVER,\n", + " build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,\n", + " ]\n", + ")\n", + "\n", + "build.build_dataflow_cfg(model_file, cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For our final build, the output products include the bitfile (and the accompanying .hwh file, also needed to execute correctly on PYNQ for Zynq platforms):" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "finn-accel.bit\tfinn-accel.hwh\r\n" + ] + } + ], + "source": [ + "! ls {final_output_dir}/bitfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The generated Python driver lets us execute the accelerator on PYNQ platforms with simply numpy i/o. You can find some notebooks showing how to use FINN-generated accelerators at runtime in the [finn-examples](https://github.com/Xilinx/finn-examples) repository." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "driver.py driver_base.py finn runtime_weights validate.py\r\n" + ] + } + ], + "source": [ + "! ls {final_output_dir}/driver" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The reports folder contains the post-synthesis resource and timing reports:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "estimate_layer_resources_hls.json post_synth_resources.xml\r\n", + "post_route_timing.rpt\r\n" + ] + } + ], + "source": [ + "! ls {final_output_dir}/report" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we have the `deploy` folder which contains everything you need to copy onto the target board to get the accelerator running:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bitfile driver\r\n" + ] + } + ], + "source": [ + "! 
ls {final_output_dir}/deploy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/end2end_example/cybersecurity/README.md b/notebooks/end2end_example/cybersecurity/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ad8a7ad602a766dad3d37f2d4e0719009b30c187 --- /dev/null +++ b/notebooks/end2end_example/cybersecurity/README.md @@ -0,0 +1,21 @@ +# Training and Deploying a Quantized MLP + +In this folder you will find a series of notebooks that guide you through +the process of training a highly quantized neural network (QNN) and generating +a high-performance streaming dataflow accelerator from it using the FINN +compiler. +If you'd like to train your own QNNs and deploy them using FINN, this is a +good starting point. + +Here, the example application is classifying network packets as malicious or +not by training a multi-layer perceptron (MLP) on the UNSW-NB15 dataset. +We recommend following these notebooks in the order they appear: + +1. Training a few-bit MLP on the UNSW-NB15 dataset +2. Exporting the trained network and verify that it works as intended +3. Generating a streaming dataflow accelerator using the FINN compiler + +Note: This tutorial abstract away the internal details of the steps to provide +a simpler introduction. If you'd like to understand more of the internal +details of what happens during the accelerator build, we recommend the +(BNN-PYNQ end-to-end notebooks)[../bnn-pynq]. diff --git a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py new file mode 100644 index 0000000000000000000000000000000000000000..45651faa5a9a57e9a1d0d784b15ebe8945d9ddd7 --- /dev/null +++ b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py @@ -0,0 +1,406 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import torch +import pandas as pd +import numpy as np +from sklearn import preprocessing +from sklearn.preprocessing import OneHotEncoder +import math + +# quantize the UNSW_NB15 dataset and convert it to binary vectors +# reimplementation +# paper: https://ev.fe.uni-lj.si/1-2-2019/Murovic.pdf +# original matlab code: https://git.io/JLLdN + + +class UNSW_NB15_quantized(torch.utils.data.Dataset): + def __init__( + self, + file_path_train, + file_path_test, + quantization=True, + onehot=False, + train=True, + ): + + self.dataframe = ( + pd.concat([pd.read_csv(file_path_train), pd.read_csv(file_path_test)]) + .reset_index() + .drop(columns=["index", "id", "attack_cat"]) + ) + + if onehot: + self.one_hot_df_encoded = self.one_hot_encoding(self.dataframe) + + if quantization: + _, self.train_df, self.test_df = self.quantize_df(self.dataframe) + + if train: + self.data = torch.FloatTensor(self.train_df.astype("float")) + else: + self.data = torch.FloatTensor(self.test_df.astype("float")) + + def get_dataframe(self): + return self.dataframe + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + target = self.data[index][-1] + data_val = self.data[index][:-1] + return data_val, target + + def dec2bin( + self, column: pd.Series, number_of_bits: int, left_msb: bool = True + ) -> np.ndarray: + """Convert a decimal pd.Series to a series of binary numbers in their + base-2 equivalents. + The output is a numpy ndarray. + Adapted from: https://stackoverflow.com/q/51471097/1520469 + Parameters + ---------- + column: pd.Series + Series with all decimal numbers that will be cast to binary + number_of_bits: int + The desired number of bits for the binary number. If bigger than + what is needed then those bits will be 0. + The number_of_bits should be >= what is needed to express the + largest decimal input + left_msb: bool + Specify that the most significant digit is the leftmost element. + If this is False, it will be the rightmost element. + Returns + ------- + numpy.ndarray + Numpy array with all elements in binary representation of the input. + + """ + + def my_binary_repr(number, nbits): + return np.binary_repr(number, nbits)[::-1] + + func = my_binary_repr if left_msb else np.binary_repr + + return np.vectorize(func)(column.values, number_of_bits) + + def round_like_matlab_number(self, n: np.float64) -> int: + """Round the input "n" like matlab uint32(n) cast (which also rounds) e.g.
+ 0.5->1; 1.5->2; 2.3->2; 2.45->2 """ + if n - math.floor(n) < 0.5: + return math.floor(n) + return math.ceil(n) + + def round_like_matlab_series(self, series: pd.Series) -> pd.Series: + rounded_values_list = [] + for value in series: + rounded_values_list.append(self.round_like_matlab_number(value)) + return pd.Series(rounded_values_list) + + def integer_encoding(self, df): + """Applies integer encoding to the object columns of the dataframe""" + le = preprocessing.LabelEncoder() + for column in df.select_dtypes("object").columns.tolist(): + df[column] = le.fit_transform(df[column]) + return df + + def quantize_df(self, df): + """Quantized the input dataframe. The scaling is done by multiplying + every column by the inverse of the minimum of that column""" + # gets the smallest positive number of a vector + def get_min_positive_number(vector): + return vector[vector > 0].min() + + # computes the maximum required bits necessary to represent each number + # from a vector of numbers + def get_max_bits(vector): + return math.ceil(math.log2(float(vector.max()) + 1.0)) + + # splits a string into a list of all characters + def char_split(s): + return np.array([ch for ch in s]) + + df_encoded = self.integer_encoding(df) + python_quantized_df = df_encoded.copy() + dict_correct_rate_values = { + 715: 34716, + 11691: 25278, + 27417: 5259117, + 45319: 60744, + 73620: 9039, + 74498: 15070, + 86933: 1024485, + 89021: 1689027, + 90272: 5259117, + 103372: 1562102, + 118192: 1759777, + 122489: 246327, + 159266: 18853, + 190473: 18423, + } + + for column in python_quantized_df.columns: + column_data = df_encoded[column] + + m = get_min_positive_number(column_data) + m_inv = 1.0 / m + if m_inv > 1: + column_data = column_data * np.float64(m_inv) + + maxbits = get_max_bits(column_data) + # CLIP, ROUND and CAST to UINT32 + column_data = np.clip( + column_data, 0, 4294967295 + ) # clip due to overflow of uint32 of matlab code + column_data = self.round_like_matlab_series( + column_data + ) # round like matlab + column_data = column_data.astype(np.uint32) # cast like matlab + + if column == "rate": + column_data.update(pd.Series(dict_correct_rate_values)) + + python_quantized_df[column] = ( + self.dec2bin(column_data, maxbits, left_msb=False) + .reshape((-1, 1)) + .flatten() + ) + + for column in python_quantized_df.columns: + python_quantized_df[column] = ( + python_quantized_df[column].apply(char_split).values + ) + + python_quantized_df_separated = pd.DataFrame( + np.column_stack(python_quantized_df.values.T.tolist()) + ) + python_train = python_quantized_df_separated.iloc[:175341] + python_test = python_quantized_df_separated.iloc[175341:] + + return ( + python_quantized_df_separated.values, + python_train.values, + python_test.values, + ) + + def one_hot_encoding(self, df): + dataframe = df.copy() + """Applies 1 hot encoding to the proto, service and state columns """ + + string_columns = ["proto", "service", "state"] + string_categories = [ + [ + [ + "tcp", + "udp", + "arp", + "ospf", + "icmp", + "igmp", + "rtp", + "ddp", + "ipv6-frag", + "cftp", + "wsn", + "pvp", + "wb-expak", + "mtp", + "pri-enc", + "sat-mon", + "cphb", + "sun-nd", + "iso-ip", + "xtp", + "il", + "unas", + "mfe-nsp", + "3pc", + "ipv6-route", + "idrp", + "bna", + "swipe", + "kryptolan", + "cpnx", + "rsvp", + "wb-mon", + "vmtp", + "ib", + "dgp", + "eigrp", + "ax.25", + "gmtp", + "pnni", + "sep", + "pgm", + "idpr-cmtp", + "zero", + "rvd", + "mobile", + "narp", + "fc", + "pipe", + "ipcomp", + "ipv6-no", + "sat-expak", + "ipv6-opts", + "snp", 
+ "ipcv", + "br-sat-mon", + "ttp", + "tcf", + "nsfnet-igp", + "sprite-rpc", + "aes-sp3-d", + "sccopmce", + "sctp", + "qnx", + "scps", + "etherip", + "aris", + "pim", + "compaq-peer", + "vrrp", + "iatp", + "stp", + "l2tp", + "srp", + "sm", + "isis", + "smp", + "fire", + "ptp", + "crtp", + "sps", + "merit-inp", + "idpr", + "skip", + "any", + "larp", + "ipip", + "micp", + "encap", + "ifmp", + "tp++", + "a/n", + "ipv6", + "i-nlsp", + "ipx-n-ip", + "sdrp", + "tlsp", + "gre", + "mhrp", + "ddx", + "ippc", + "visa", + "secure-vmtp", + "uti", + "vines", + "crudp", + "iplt", + "ggp", + "ip", + "ipnip", + "st2", + "argus", + "bbn-rcc", + "egp", + "emcon", + "igp", + "nvp", + "pup", + "xnet", + "chaos", + "mux", + "dcn", + "hmp", + "prm", + "trunk-1", + "xns-idp", + "leaf-1", + "leaf-2", + "rdp", + "irtp", + "iso-tp4", + "netblt", + "trunk-2", + "cbt", + ] + ], + [ + [ + "-", + "ftp", + "smtp", + "snmp", + "http", + "ftp-data", + "dns", + "ssh", + "radius", + "pop3", + "dhcp", + "ssl", + "irc", + ] + ], + [ + [ + "FIN", + "INT", + "CON", + "ECO", + "REQ", + "RST", + "PAR", + "URN", + "no", + "ACC", + "CLO", + ] + ], + ] + + for column, categories in zip(string_columns, string_categories): + column_df = dataframe.loc[:, [column]] + + one_hot_encoder = OneHotEncoder(sparse=False, categories=categories) + # Fit OneHotEncoder to dataframe + one_hot_encoder.fit(column_df) + # Transform the dataframe + column_df_encoded = one_hot_encoder.transform(column_df) + # Create dataframe from the 2-d array + column_df_encoded = pd.DataFrame( + data=column_df_encoded, columns=one_hot_encoder.categories_[0] + ) + dataframe = pd.concat([column_df_encoded, dataframe], axis=1, sort=False) + + # delete proto,service and state columns + dataframe = dataframe.drop(string_columns, 1) + + return dataframe diff --git a/notebooks/end2end_example/cybersecurity/finn-example.png b/notebooks/end2end_example/cybersecurity/finn-example.png new file mode 100644 index 0000000000000000000000000000000000000000..b9335f720151ea64d9ae70cdf4d4c27dabec6f74 Binary files /dev/null and b/notebooks/end2end_example/cybersecurity/finn-example.png differ diff --git a/notebooks/end2end_example/cybersecurity/state_dict.pth b/notebooks/end2end_example/cybersecurity/state_dict.pth new file mode 100644 index 0000000000000000000000000000000000000000..53c002e3fa6f2ae3e7c8f0abb71fa446d80a8f09 Binary files /dev/null and b/notebooks/end2end_example/cybersecurity/state_dict.pth differ diff --git a/notebooks/end2end_example/mobilenet_end2end_example.ipynb b/notebooks/end2end_example/mobilenet/mobilenet_end2end_example.ipynb similarity index 100% rename from notebooks/end2end_example/mobilenet_end2end_example.ipynb rename to notebooks/end2end_example/mobilenet/mobilenet_end2end_example.ipynb diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 6fe242cc4d315b90aaf82acc19d681a322a9451b..ab7cde7e0c96ed17e3b93655b2f0171673f9f7a9 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -119,6 +119,17 @@ default_build_dataflow_steps = [ "step_deployment_package", ] +#: List of steps to run for an estimate-only (no synthesis) dataflow build +estimate_only_dataflow_steps = [ + "step_tidy_up", + "step_streamline", + "step_convert_to_hls", + "step_create_dataflow_partition", + "step_target_fps_parallelization", + "step_apply_folding_config", + "step_generate_estimate_reports", +] + @dataclass_json @dataclass diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py 
b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 37d6192396b819dd3a0d23a7b779c11d9f756bc7..02912b2d5f45b3bab0eaca13ee0a0bf19bf9cfca 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -323,7 +323,7 @@ class HLSCustomOp(CustomOp): ip_path = ipgen_path + "/sol1/impl/ip" assert os.path.isdir( ip_path - ), "IPGen failed: %s not found. Check log under %s" % (ip_path, ipgen_path) + ), "IPGen failed: %s not found. Check log under %s" % (ip_path, code_gen_dir) self.set_nodeattr("ip_path", ip_path) vlnv = "xilinx.com:hls:%s:1.0" % node.name self.set_nodeattr("ip_vlnv", vlnv) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 46f07b0cc1ed957581416681d00326619439e9bd..94305b861cbe0c5e6b641c9dccee7976c73c236f 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -574,8 +574,9 @@ class InferQuantizedStreamingFCLayer(Transformation): thresholds neither 1 nor MH.""" odt = model.get_tensor_datatype(mt_output) scale = getCustomOp(consumer).get_nodeattr("out_scale") + bipolar_ok = odt == DataType.BIPOLAR and scale == 2.0 assert ( - scale == 1.0 + scale == 1.0 or bipolar_ok ), "out_scale must be equal to 1.0 for HLS conversion." actval = getCustomOp(consumer).get_nodeattr("out_bias") assert ( diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index e26e92391edd8ac420e89c72fb34c5554c601967..0f2b8ef6a4c0858cd98218538930c97c6df2ad9d 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -17,10 +17,14 @@ def _is_dwc_node(node): def _suitable_node(node): if node is not None: if is_fpgadataflow_node(node) is True: - if _is_dwc_node(node) is False: - return True - else: + if _is_dwc_node(node): + # no DWC for DWCs + return False + elif node.op_type == "IODMA": + # IODMA data shapes/widths need special handling return False + else: + return True else: return False else: @@ -28,8 +32,7 @@ def _suitable_node(node): class InsertDWC(Transformation): - """Ensure that the graph is terminated with a TLastMarker node, inserting - one if necessary.""" + """Add data width converters between layers where necessary.""" def __init__(self): super().__init__() diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 6abba70619089c30e187f8ee7c8128725509c030..9a797bf0243d6c4f395bb9ec50bbf675cce89cbd 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -85,14 +85,19 @@ class InsertIODMA(Transformation): ), "Data layout of output tensor must be NHWC or NC" out_shape = model.get_tensor_shape(graph_out_name) out_dtype = model.get_tensor_datatype(graph_out_name) + final_node_inst = getCustomOp(final_node) + out_folded_shape = final_node_inst.get_folded_output_shape() + # take advantage of AXI stream width padding for DMA alignment + # (AXI streams are always padded to 8 bits) + # this is the width of stream input to DMA + padded_outstream_width = final_node_inst.get_outstream_width_padded() + padded_outstream_bytes = padded_outstream_width // 8 # determine the feasible interface width - transfer_bits = np.prod(out_shape) * out_dtype.bitwidth() + transfer_bits = padded_outstream_width * 
np.prod(out_folded_shape[:-1]) intfwidth = math.gcd(transfer_bits, self.max_intfwidth) assert ( intfwidth % 8 == 0 ), "No feasible interface width for transfer size" - # get width of stream input to DMA - streamWidth = getCustomOp(final_node).get_outstream_width() # make new buffer final_node_out = oh.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape @@ -101,15 +106,17 @@ class InsertIODMA(Transformation): model.set_tensor_datatype(final_node_out.name, out_dtype) # reroute final node output to final_node_out_name final_node.output[0] = final_node_out.name + # FIXME: currently always using 8-bit dtypes to work around the + # padding problems for i/o DMA dma_node = oh.make_node( "IODMA", [final_node_out.name], [graph_out_name], - numInputVectors=out_shape[:-1], - NumChannels=out_shape[-1], - dataType=str(out_dtype.name), + numInputVectors=out_folded_shape[:-1], + NumChannels=padded_outstream_bytes, + dataType="UINT8", intfWidth=intfwidth, - streamWidth=streamWidth, + streamWidth=padded_outstream_width, direction="out", domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", @@ -123,31 +130,38 @@ class InsertIODMA(Transformation): ), "Data layout of input tensor must be NHWC or NC" in_shape = model.get_tensor_shape(graph_in_name) in_dtype = model.get_tensor_datatype(graph_in_name) + first_node_inst = getCustomOp(first_node) + in_folded_shape = first_node_inst.get_folded_input_shape() + # take advantage of AXI stream width padding for DMA alignment + # (AXI streams are always padded to 8 bits) + # this is the width of stream output expected from the DMA + padded_instream_width = first_node_inst.get_instream_width_padded() + padded_instream_bytes = padded_instream_width // 8 # determine the feasible interface width - transfer_bits = np.prod(in_shape) * in_dtype.bitwidth() + transfer_bits = padded_instream_width * np.prod(out_folded_shape[:-1]) intfwidth = math.gcd(transfer_bits, self.max_intfwidth) assert ( intfwidth % 8 == 0 ), "No feasible interface width for transfer size" - # get width of stream output from DMA - streamWidth = getCustomOp(first_node).get_instream_width() # make new buffer first_node_in = oh.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape ) model.graph.value_info.append(first_node_in) model.set_tensor_datatype(first_node_in.name, in_dtype) - # reroute final node output to final_node_out_name + # reroute first node input + # FIXME: currently always using 8-bit dtypes to work around the + # padding problems for i/o DMA first_node.input[0] = first_node_in.name dma_node = oh.make_node( "IODMA", [graph_in_name], [first_node_in.name], - numInputVectors=in_shape[:-1], - NumChannels=in_shape[-1], - dataType=str(in_dtype.name), + numInputVectors=in_folded_shape[:-1], + NumChannels=padded_instream_bytes, + dataType="UINT8", intfWidth=intfwidth, - streamWidth=streamWidth, + streamWidth=padded_instream_width, direction="in", domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", diff --git a/src/finn/util/vivado.py b/src/finn/util/vivado.py deleted file mode 100644 index c9b6e18938586bcc48eee19583bdedabb052b821..0000000000000000000000000000000000000000 --- a/src/finn/util/vivado.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -from finn.util.basic import launch_process_helper - - -def which(program): - "Python equivalent of the shell cmd 'which'." - - # source: - # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python - def is_exe(fpath): - return os.path.isfile(fpath) and os.access(fpath, os.X_OK) - - fpath, fname = os.path.split(program) - if fpath: - if is_exe(program): - return program - else: - for path in os.environ["PATH"].split(os.pathsep): - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - - return None - - -def out_of_context_synth( - verilog_dir, - top_name, - fpga_part="xczu3eg-sbva484-1-e", - clk_name="ap_clk_0", - clk_period_ns=5.0, -): - "Run out-of-context Vivado synthesis, return resources and slack." 
- - # ensure that the OH_MY_XILINX envvar is set - if "OHMYXILINX" not in os.environ: - raise Exception("The environment variable OHMYXILINX is not defined.") - # ensure that vivado is in PATH: source $VIVADO_PATH/settings64.sh - if which("vivado") is None: - raise Exception("vivado is not in PATH, ensure settings64.sh is sourced.") - omx_path = os.environ["OHMYXILINX"] - script = "vivadocompile.sh" - # vivadocompile.sh <top-level-entity> <clock-name (optional)> <fpga-part (optional)> - call_omx = "zsh %s/%s %s %s %s %f" % ( - omx_path, - script, - top_name, - clk_name, - fpga_part, - float(clk_period_ns), - ) - call_omx = call_omx.split() - launch_process_helper(call_omx, proc_env=os.environ.copy(), cwd=verilog_dir) - - vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name) - res_counts_path = vivado_proj_folder + "/res.txt" - - with open(res_counts_path, "r") as myfile: - res_data = myfile.read().split("\n") - ret = {} - ret["vivado_proj_folder"] = vivado_proj_folder - for res_line in res_data: - res_fields = res_line.split("=") - print(res_fields) - try: - ret[res_fields[0]] = float(res_fields[1]) - except ValueError: - ret[res_fields[0]] = 0 - except IndexError: - ret[res_fields[0]] = 0 - if ret["WNS"] == 0: - ret["fmax_mhz"] = 0 - else: - ret["fmax_mhz"] = 1000.0 / (clk_period_ns - ret["WNS"]) - return ret
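As a closing cross-check, the `fmax_mhz` and `estimated_throughput_fps` values in the `ooc_synth_and_timing.json` report shown earlier line up with the formula used by the helper deleted here (a plain-Python sketch; the input numbers are copied from that report and from the cycle estimates):

```python
# Sketch: reproduce the out-of-context synthesis figures reported earlier.
clk_period_ns = 10.0  # synth_clk_period_ns used for the build
wns_ns = 0.632        # "WNS" from ooc_synth_and_timing.json
max_cycles = 80       # slowest layer, from estimate_layer_cycles.json

fmax_mhz = 1000.0 / (clk_period_ns - wns_ns)
print(fmax_mhz)                     # ~106.75, cf. "fmax_mhz"
print(fmax_mhz * 1e6 / max_cycles)  # ~1334330, cf. "estimated_throughput_fps"
```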