Commit fe592d5e authored by Georg Streich's avatar Georg Streich

First steps to supporting accl

parent 1bcf6d31
Showing 20448 additions and 245 deletions
template<unsigned int accl_width, unsigned int stream_width, unsigned int count>
void accl_out(
unsigned int destination,
ap_uint<32> comm_adr,
ap_uint<32> dpcfg_adr,
STREAM<command_word> &cmd_to_cclo,
STREAM<command_word> &sts_from_cclo,
STREAM<stream_word> &data_to_cclo,
STREAM<stream_word> &data_from_cclo,
hls::stream<ap_uint<stream_width>> &in
) {
#pragma HLS INTERFACE axis port=cmd_to_cclo
#pragma HLS INTERFACE axis port=sts_from_cclo
#pragma HLS INTERFACE axis port=data_to_cclo
#pragma HLS INTERFACE axis port=data_from_cclo
#pragma HLS INTERFACE axis port=in
accl_hls::ACCLCommand accl(cmd_to_cclo, sts_from_cclo, comm_adr, dpcfg_adr, 0, 3);
accl_hls::ACCLData data(data_to_cclo, data_from_cclo);
    ap_uint<512> accl_word;
    ap_uint<stream_width> stream_buf;  // renamed to avoid shadowing the stream_word type
    int num_bits = count * accl_width;
    int step = std::gcd(accl_width, stream_width);
    // Repack the input stream into 512-bit ACCL words, copying
    // gcd(accl_width, stream_width) bits per iteration.
    for (int i = 0; i < num_bits; i += step) {
        if (i % stream_width == 0) {
            stream_buf = in.read();
        }
        accl_word(i % accl_width + step - 1, i % accl_width) =
            stream_buf(i % stream_width + step - 1, i % stream_width);
        if ((i + step) % accl_width == 0) {
            data.push(accl_word, 0);
        }
    }
    accl.stream_put(num_bits / 32, 9, destination, 0);
}
template<unsigned int accl_width, unsigned int stream_width, unsigned int count>
void accl_in(
unsigned int destination,
ap_uint<32> comm_adr,
ap_uint<32> dpcfg_adr,
STREAM<command_word> &cmd_to_cclo,
STREAM<command_word> &sts_from_cclo,
STREAM<stream_word> &data_to_cclo,
STREAM<stream_word> &data_from_cclo,
hls::stream<ap_uint<stream_width>> &out
) {
#pragma HLS INTERFACE axis port=cmd_to_cclo
#pragma HLS INTERFACE axis port=sts_from_cclo
#pragma HLS INTERFACE axis port=data_to_cclo
#pragma HLS INTERFACE axis port=data_from_cclo
#pragma HLS INTERFACE axis port=out
accl_hls::ACCLCommand accl(cmd_to_cclo, sts_from_cclo, comm_adr, dpcfg_adr, 0, 3);
accl_hls::ACCLData data(data_to_cclo, data_from_cclo);
    ap_uint<512> accl_word;
    ap_uint<stream_width> stream_buf;  // renamed to avoid shadowing the stream_word type
    int num_bits = count * accl_width;
    int step = std::gcd(accl_width, stream_width);
    // Unpack 512-bit ACCL words into output stream words, copying
    // gcd(accl_width, stream_width) bits per iteration.
    for (int i = 0; i < num_bits; i += step) {
        if (i % accl_width == 0) {
            accl_word = data.pull().data;
        }
        stream_buf(i % stream_width + step - 1, i % stream_width) =
            accl_word(i % accl_width + step - 1, i % accl_width);
        if ((i + step) % stream_width == 0) {
            out.write(stream_buf);
        }
    }
}
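
For reference, a minimal Python model of the gcd-step repacking that `accl_out` and `accl_in` implement (an illustrative sketch only, not part of this commit; it assumes the little-endian bit packing implied by the `ap_uint` range selects above):

```python
from math import gcd

def repack(words, src_width, dst_width):
    """Repack src_width-bit integers into dst_width-bit integers,
    copying gcd(src_width, dst_width) bits per step, lowest bits first."""
    step = gcd(src_width, dst_width)
    total_bits = len(words) * src_width
    out, cur = [], 0
    for i in range(0, total_bits, step):
        chunk = (words[i // src_width] >> (i % src_width)) & ((1 << step) - 1)
        cur |= chunk << (i % dst_width)
        if (i + step) % dst_width == 0:  # destination word is full
            out.append(cur)
            cur = 0
    return out

# accl_out packs e.g. 64-bit stream words into 512-bit ACCL words;
# accl_in performs the inverse repacking, so a round trip is lossless.
assert repack(repack(list(range(8)), 64, 512), 512, 64) == list(range(8))
```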
......@@ -61,6 +61,7 @@ RUN apt-get update && \
python3 \
python-is-python3 \
python3-pip
RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
RUN locale-gen "en_US.UTF-8"
......@@ -116,6 +117,17 @@ RUN pip install tokenize-rt==4.2.1
# pyverilator
RUN pip install tclwrapper==0.0.1
# install accl dependencies
RUN apt-get install -y \
    cmake \
    libjsoncpp-dev \
    libtclap-dev \
    libopenmpi-dev \
    xvfb
RUN git clone https://github.com/zeromq/zmqpp
RUN cd zmqpp && make && make install
# extra environment variables for FINN compiler
ENV VIVADO_IP_CACHE "/tmp/vivado_ip_cache"
......@@ -123,5 +135,6 @@ COPY docker/finn_entrypoint.sh /usr/local/bin/
COPY docker/quicktest.sh /usr/local/bin/
RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
RUN chmod 755 /usr/local/bin/quicktest.sh
ENTRYPOINT ["finn_entrypoint.sh"]
CMD ["bash"]
......@@ -57,6 +57,7 @@ recho () {
# qonnx (using workaround for https://github.com/pypa/pip/issues/7953)
# to be fixed in future Ubuntu versions (https://bugs.launchpad.net/ubuntu/+source/setuptools/+bug/1994016)
pip install --no-build-isolation --no-warn-script-location -e ${FINN_ROOT}/deps/qonnx
# finn-experimental
pip install --user -e ${FINN_ROOT}/deps/finn-experimental
# brevitas
......
%% Cell type:markdown id: tags:
# Verify Exported ONNX Model in FINN
<font color="red">**Live FINN tutorial:** We recommend clicking **Cell -> Run All** when you start reading this notebook for "latency hiding".</font>
**Important: This notebook depends on the 1-train-mlp-with-brevitas notebook, because we are using the ONNX model that was exported there. So please make sure the needed .onnx file is generated before you run this notebook.**
**Also remember to 'close and halt' any other FINN notebooks, since Netron visualizations use the same port.**
In this notebook we will show how to import the network we trained in Brevitas and verify it in the FINN compiler.
This verification process can actually be done at various stages in the compiler [as explained in this notebook](../bnn-pynq/tfc_end2end_verification.ipynb) but for this example we'll only consider the first step: verifying the exported high-level FINN-ONNX model.
Another goal of this notebook is to introduce you to the concept of *graph transformations* -- we'll be applying some transformations to the graph to make it executable for verification.
Once this model is successfully verified, we'll generate an FPGA accelerator from it in the next notebook.
%% Cell type:code id: tags:
``` python
import onnx
import torch
```
%% Cell type:markdown id: tags:
**This is important -- always import onnx before torch**. This is a workaround for a [known bug](https://github.com/onnx/onnx/issues/2394).
%% Cell type:markdown id: tags:
## Outline
-------------
1. [Import model into FINN with ModelWrapper](#brevitas_import_visualization)
2. [Network preparations: Tidy-up transformations](#network_preparations)
3. [Load the dataset and Brevitas model](#load_dataset)
4. [Compare FINN and Brevitas execution](#compare_brevitas)
%% Cell type:markdown id: tags:
# 1. Import model into FINN with ModelWrapper <a id="brevitas_import_visualization"></a>
Now that we have the model in .onnx format, we can work with it using FINN. To import it into FINN, we'll use the [`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#qonnx.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model.
%% Cell type:code id: tags:
``` python
import os
from qonnx.core.modelwrapper import ModelWrapper
model_dir = os.environ['FINN_ROOT'] + "/notebooks/end2end_example/cybersecurity"
ready_model_filename = model_dir + "/cybsec-mlp-ready.onnx"
model_for_sim = ModelWrapper(ready_model_filename)
```
%% Output
/home/streichg/ACCL/finn/deps/qonnx/src/qonnx/core/modelwrapper.py:93: UserWarning: Some old-style domain attributes were automatically converted to new-style,
i.e. domain=finn to domain=qonnx.custom_op.<general|fpgadataflow|...>
warnings.warn(
%% Cell type:markdown id: tags:
Let's have a look at some of the member functions exposed by `ModelWrapper` to see what kind of information we can extract from it.
%% Cell type:code id: tags:
``` python
dir(model_for_sim)
```
%% Output
['__class__',
'__delattr__',
'__dict__',
'__dir__',
'__doc__',
'__eq__',
'__format__',
'__ge__',
'__getattribute__',
'__gt__',
'__hash__',
'__init__',
'__init_subclass__',
'__le__',
'__lt__',
'__module__',
'__ne__',
'__new__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__setattr__',
'__sizeof__',
'__str__',
'__subclasshook__',
'__weakref__',
'_model_proto',
'analysis',
'check_all_tensor_shapes_specified',
'check_compatibility',
'cleanup',
'find_consumer',
'find_consumers',
'find_direct_predecessors',
'find_direct_successors',
'find_producer',
'find_upstream',
'fix_float64',
'get_all_tensor_names',
'get_finn_nodes',
'get_initializer',
'get_metadata_prop',
'get_node_index',
'get_nodes_by_op_type',
'get_non_finn_nodes',
'get_tensor_datatype',
'get_tensor_fanout',
'get_tensor_layout',
'get_tensor_shape',
'get_tensor_sparsity',
'get_tensor_valueinfo',
'graph',
'is_fork_node',
'is_join_node',
'make_empty_exec_context',
'make_new_valueinfo_name',
'model',
'rename_tensor',
'save',
'set_initializer',
'set_metadata_prop',
'set_tensor_datatype',
'set_tensor_layout',
'set_tensor_shape',
'set_tensor_sparsity',
'temporary_fix_oldstyle_domain',
'transform']
%% Cell type:markdown id: tags:
Many of these helper functions relate to extracting information about the structure and properties of the ONNX model. You can find out more about examining and manipulating ONNX models programmatically in [this tutorial](../../basics/0_how_to_work_with_onnx.ipynb), but we'll show a few basic functions here. For instance, we can extract the shape and datatype annotation for various tensors in the graph, as well as information related to the operation types associated with each node.
%% Cell type:code id: tags:
``` python
from qonnx.core.datatype import DataType
finnonnx_in_tensor_name = model_for_sim.graph.input[0].name
finnonnx_out_tensor_name = model_for_sim.graph.output[0].name
print("Input tensor name: %s" % finnonnx_in_tensor_name)
print("Output tensor name: %s" % finnonnx_out_tensor_name)
finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)
finnonnx_model_out_shape = model_for_sim.get_tensor_shape(finnonnx_out_tensor_name)
print("Input tensor shape: %s" % str(finnonnx_model_in_shape))
print("Output tensor shape: %s" % str(finnonnx_model_out_shape))
finnonnx_model_in_dt = model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)
finnonnx_model_out_dt = model_for_sim.get_tensor_datatype(finnonnx_out_tensor_name)
print("Input tensor datatype: %s" % str(finnonnx_model_in_dt.name))
print("Output tensor datatype: %s" % str(finnonnx_model_out_dt.name))
print("List of node operator types in the graph: ")
print([x.op_type for x in model_for_sim.graph.node])
```
%% Output
Input tensor name: 0
Output tensor name: 73
Input tensor shape: [1, 600]
Output tensor shape: [1, 1]
Input tensor datatype: BIPOLAR
Output tensor datatype: FLOAT32
List of node operator types in the graph:
['Mul', 'Add', 'Div', 'MatMul', 'Mul', 'Add', 'BatchNormalization', 'MultiThreshold', 'Mul', 'MatMul', 'Mul', 'Add', 'BatchNormalization', 'MultiThreshold', 'Mul', 'MatMul', 'Mul', 'Add', 'BatchNormalization', 'MultiThreshold', 'Mul', 'MatMul', 'Mul', 'Add', 'MultiThreshold']
%% Cell type:markdown id: tags:
Note that the output tensor is, for now, marked as a float32 value, even though we know the output is binary. The correct output datatype will be inferred automatically by the compiler in the next step, when we run the `InferDataTypes` transformation.
%% Cell type:markdown id: tags:
# 2. Network preparation: Tidy-up transformations <a id="network_preparations"></a>
Before running the verification, we need to prepare our FINN-ONNX model. In particular, all the intermediate tensors need to have statically defined shapes. To do this, we apply some graph transformations to the model like a kind of "tidy-up" to make it easier to process.
**Graph transformations in FINN.** The whole FINN compiler is built around the idea of transformations, which gradually transform the model into a synthesizable hardware description. Although FINN offers functionality that automatically calls a standard sequence of transformations (covered in the next notebook), you can also manually call individual transformations (like we do here), as well as adding your own transformations, to create custom flows. You can read more about these transformations in [this notebook](../bnn-pynq/tfc_end2end_example.ipynb).
%% Cell type:code id: tags:
``` python
from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs
from qonnx.transformation.infer_shapes import InferShapes
from qonnx.transformation.infer_datatypes import InferDataTypes
from qonnx.transformation.fold_constants import FoldConstants
model_for_sim = model_for_sim.transform(InferShapes())
model_for_sim = model_for_sim.transform(FoldConstants())
model_for_sim = model_for_sim.transform(GiveUniqueNodeNames())
model_for_sim = model_for_sim.transform(GiveReadableTensorNames())
model_for_sim = model_for_sim.transform(InferDataTypes())
model_for_sim = model_for_sim.transform(RemoveStaticGraphInputs())
verif_model_filename = model_dir + "/cybsec-mlp-verification.onnx"
model_for_sim.save(verif_model_filename)
```
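%% Cell type:markdown id: tags:
As a quick, illustrative sanity check (this cell is not part of the original notebook): after `InferDataTypes` has run, re-querying the output tensor annotation should now report `BIPOLAR` instead of the initial `FLOAT32`. Note that `GiveReadableTensorNames` has renamed the tensors, so we look up the output name again.
%% Cell type:code id: tags:
``` python
out_name = model_for_sim.graph.output[0].name
print("Output tensor datatype after InferDataTypes: %s" % model_for_sim.get_tensor_datatype(out_name).name)
```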
%% Cell type:markdown id: tags:
**Would the FINN compiler still work if we didn't do this?** The compilation step in the next notebook applies these transformations internally and would work fine, but we're going to use FINN's verification capabilities below and these require the tidy-up transformations.
%% Cell type:markdown id: tags:
Let's view our ready-to-go model after the transformations. Note that all intermediate tensors now have their shapes specified (indicated by numbers next to the arrows going between layers). Additionally, the datatype inference step has propagated quantization annotations to the outputs of `MultiThreshold` layers (expand by clicking the + next to the name of the tensor to see the quantization annotation) and the final output tensor.
%% Cell type:code id: tags:
``` python
from finn.util.visualization import showInNetron
showInNetron(verif_model_filename)
```
%% Output
Serving 'cybsec-mlp-verification.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7fdcd2beb100>
%% Cell type:markdown id: tags:
# 3. Load the Dataset and the Brevitas Model <a id="load_dataset"></a>
We'll use some example data from the quantized UNSW-NB15 dataset (from the previous notebook) as inputs for the verification.
%% Cell type:code id: tags:
``` python
import numpy as np
from torch.utils.data import TensorDataset
def get_preqnt_dataset(data_dir: str, train: bool):
unsw_nb15_data = np.load(data_dir + "/unsw_nb15_binarized.npz")
if train:
partition = "train"
else:
partition = "test"
part_data = unsw_nb15_data[partition].astype(np.float32)
part_data = torch.from_numpy(part_data)
part_data_in = part_data[:, :-1]
part_data_out = part_data[:, -1]
return TensorDataset(part_data_in, part_data_out)
n_verification_inputs = 100
test_quantized_dataset = get_preqnt_dataset(".", False)
input_tensor = test_quantized_dataset.tensors[0][:n_verification_inputs]
input_tensor.shape
```
%% Output
torch.Size([100, 593])
%% Cell type:markdown id: tags:
Let's also bring up the MLP we trained in Brevitas from the previous notebook. We'll compare its outputs to what is generated by FINN.
%% Cell type:code id: tags:
``` python
input_size = 593
hidden1 = 64
hidden2 = 64
hidden3 = 64
weight_bit_width = 2
act_bit_width = 2
num_classes = 1
from brevitas.nn import QuantLinear, QuantReLU
import torch.nn as nn
brevitas_model = nn.Sequential(
QuantLinear(input_size, hidden1, bias=True, weight_bit_width=weight_bit_width),
nn.BatchNorm1d(hidden1),
nn.Dropout(0.5),
QuantReLU(bit_width=act_bit_width),
QuantLinear(hidden1, hidden2, bias=True, weight_bit_width=weight_bit_width),
nn.BatchNorm1d(hidden2),
nn.Dropout(0.5),
QuantReLU(bit_width=act_bit_width),
QuantLinear(hidden2, hidden3, bias=True, weight_bit_width=weight_bit_width),
nn.BatchNorm1d(hidden3),
nn.Dropout(0.5),
QuantReLU(bit_width=act_bit_width),
QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width)
)
# replace this with your trained network checkpoint if you're not
# using the pretrained weights
trained_state_dict = torch.load(model_dir + "/state_dict.pth")["models_state_dict"][0]
# Uncomment the following line if you previously chose to train the network yourself
#trained_state_dict = torch.load("state_dict_self-trained.pth")
brevitas_model.load_state_dict(trained_state_dict, strict=False)
```
%% Output
<All keys matched successfully>
%% Cell type:code id: tags:
``` python
def inference_with_brevitas(current_inp):
brevitas_output = brevitas_model.forward(current_inp)
# apply sigmoid + threshold
brevitas_output = torch.sigmoid(brevitas_output)
brevitas_output = (brevitas_output.detach().numpy() > 0.5) * 1
# convert output to bipolar
brevitas_output = 2*brevitas_output - 1
return brevitas_output
```
%% Cell type:markdown id: tags:
# 4. Compare FINN & Brevitas execution <a id="compare_brevitas"></a>
%% Cell type:markdown id: tags:
Let's make helper functions to execute the same input with Brevitas and FINN. For FINN, we'll use the [`finn.core.onnx_exec`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#finn.core.onnx_exec.execute_onnx) function to execute the exported FINN-ONNX on the inputs. Note that this ONNX execution is for verification only; not for accelerated execution.
Recall that the quantized values from the dataset are 593-bit binary {0, 1} vectors whereas our exported model takes 600-bit bipolar {-1, +1} vectors, so we'll have to preprocess them a bit before we can use them for verifying the ONNX model.
%% Cell type:code id: tags:
``` python
import finn.core.onnx_exec as oxe
def inference_with_finn_onnx(current_inp):
finnonnx_in_tensor_name = model_for_sim.graph.input[0].name
finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)
finnonnx_out_tensor_name = model_for_sim.graph.output[0].name
# convert input to numpy for FINN
current_inp = current_inp.detach().numpy()
# add padding and re-scale to bipolar
current_inp = np.pad(current_inp, [(0, 0), (0, 7)])
current_inp = 2*current_inp-1
# reshape to expected input (add 1 for batch dimension)
current_inp = current_inp.reshape(finnonnx_model_in_shape)
# create the input dictionary
input_dict = {finnonnx_in_tensor_name : current_inp}
# run with FINN's execute_onnx
output_dict = oxe.execute_onnx(model_for_sim, input_dict)
#get the output tensor
finn_output = output_dict[finnonnx_out_tensor_name]
return finn_output
```
%% Cell type:markdown id: tags:
Now we can call our inference helper functions for each input and compare the outputs.
%% Cell type:code id: tags:
``` python
import numpy as np
from tqdm import trange
verify_range = trange(n_verification_inputs, desc="FINN execution", position=0, leave=True)
brevitas_model.eval()
ok = 0
nok = 0
for i in verify_range:
# run in Brevitas with PyTorch tensor
current_inp = input_tensor[i].reshape((1, 593))
brevitas_output = inference_with_brevitas(current_inp)
finn_output = inference_with_finn_onnx(current_inp)
# compare the outputs
ok += 1 if finn_output == brevitas_output else 0
nok += 1 if finn_output != brevitas_output else 0
verify_range.set_description("ok %d nok %d" % (ok, nok))
verify_range.refresh()
```
%% Output
ok 100 nok 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00, 5.30it/s]
%% Cell type:code id: tags:
``` python
try:
assert ok == n_verification_inputs
print("Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical")
except AssertionError:
assert False, "Verification failed. Brevitas and FINN-ONNX execution outputs are NOT identical"
```
%% Output
Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical
%% Cell type:markdown id: tags:
This concludes our second notebook. In the next one, we'll take the ONNX model we just verified all the way down to FPGA hardware with the FINN compiler.
......
%% Cell type:markdown id: tags:
# Building the Streaming Dataflow Accelerator
<font color="red">**Live FINN tutorial:** We recommend clicking **Cell -> Run All** when you start reading this notebook for "latency hiding".</font>
**Important: This notebook depends on the 1-train-mlp-with-brevitas notebook because we are using models that were created by that notebook. So please make sure the needed .onnx files are generated prior to running this notebook.**
<img align="left" src="finn-example.png" alt="drawing" style="margin-right: 20px" width="250"/>
In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from our quantized MLP for the cybersecurity task. The key idea in such architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, as illustrated in the figure on the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vivado HLS description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.
These implementations offer a good balance of performance and flexibility, but building them by hand is difficult and time-consuming. This is where the FINN compiler comes in: it can build streaming dataflow accelerators from an ONNX description to match the desired throughput.
%% Cell type:markdown id: tags:
## Outline
-------------
1. [Introduction to `build_dataflow` Tool](#intro_build_dataflow)
2. [Understanding the Build Configuration: `DataflowBuildConfig`](#underst_build_conf)
    2.1 [Output Products](#output_prod)
    2.2 [Configuring the Board and FPGA Part](#config_fpga)
    2.3 [Configuring the Performance](#config_perf)
3. [Launch a Build: Only Estimate Reports](#build_estimate_report)
4. [Launch a Build: Stitched IP, out-of-context synth and rtlsim Performance](#build_ip_synth_rtlsim)
5. [(Optional) Launch a Build: PYNQ Bitfile and Driver](#build_bitfile_driver)
6. [(Optional) Run on PYNQ board](#run_on_pynq)
%% Cell type:markdown id: tags:
## Introduction to `build_dataflow` Tool <a id="intro_build_dataflow"></a>
Since version 0.5b, the FINN compiler has a `build_dataflow` tool. Compared to previous versions, which required setting up all the needed transformations in a Python script, it makes experimenting with dataflow architecture generation easier. The core idea is to specify the relevant build info as a configuration `dict`, from which the tool invokes all the necessary steps to make the dataflow build happen. It can be invoked either from the [command line](https://finn-dev.readthedocs.io/en/latest/command_line.html) or with a single Python function call.
In this notebook, we'll use the Python function call to invoke the builds to stay inside the Jupyter notebook, but feel free to experiment with reproducing what we do here with the `./run-docker.sh build_dataflow` and `./run-docker.sh build_custom` command-line entry points too.
%% Cell type:markdown id: tags:
## Understanding the Build Configuration: `DataflowBuildConfig` <a id="underst_build_conf"></a>
The build configuration is specified by an instance of `finn.builder.build_dataflow_config.DataflowBuildConfig`. The configuration is a Python [`dataclass`](https://docs.python.org/3/library/dataclasses.html) which can be serialized into or de-serialized from JSON files for persistence, although we'll just set it up in Python here.
There are many options in the configuration to customize different aspects of the build; we'll only cover a few of them in this notebook. You can read the details on all the config options on [the FINN API documentation](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowBuildConfig).
Let's go over some of the members of the `DataflowBuildConfig`:
### Output Products <a id="output_prod"></a>
The build can produce many different outputs, and some of them can take a long time (e.g. bitfile synthesis for a large network). When you first start working on generating a new accelerator and exploring the different performance options, you may not want to go all the way to a bitfile. Thus, in the beginning you may just select the estimate reports as the output products. Gradually, you can generate the output products from later stages until you are happy enough with the design to build the full accelerator integrated into a shell.
The output products are controlled by:
* `generate_outputs`: list of output products (of type [`finn.builder.build_dataflow_config.DataflowOutputType`](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowOutputType)) that will be generated by the build. Some available options are:
- `ESTIMATE_REPORTS` : report expected resources and performance per layer and for the whole network without any synthesis
- `STITCHED_IP` : create a stream-in stream-out IP design that can be integrated into other Vivado IPI or RTL designs
- `RTLSIM_PERFORMANCE` : use PyVerilator to do a performance/latency test of the `STITCHED_IP` design
- `OOC_SYNTH` : run out-of-context synthesis (just the accelerator itself, without any system surrounding it) on the `STITCHED_IP` design to get post-synthesis FPGA resources and achievable clock frequency
- `BITFILE` : integrate the accelerator into a shell to produce a standalone bitfile
- `PYNQ_DRIVER` : generate a PYNQ Python driver that can be used to launch the accelerator
- `DEPLOYMENT_PACKAGE` : create a folder with the `BITFILE` and `PYNQ_DRIVER` outputs, ready to be copied to the target FPGA platform.
* `output_dir`: the directory where all the generated build outputs above will be written into.
* `steps`: list of predefined (or custom) build steps FINN will go through. Use `build_dataflow_config.estimate_only_dataflow_steps` to execute only the steps needed for estimation (without any synthesis), and the `build_dataflow_config.default_build_dataflow_steps` otherwise (which is the default value). You can find the list of default steps [here](https://finn.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.default_build_dataflow_steps) in the documentation.
### Configuring the Board and FPGA Part <a id="config_fpga"></a>
* `fpga_part`: Xilinx FPGA part to be used for synthesis, can be left unspecified to be inferred from `board` below, or specified explicitly for e.g. out-of-context synthesis.
* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell. See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/basic.py#L41) for a list of possible boards.
* `shell_flow_type`: the target [shell flow type](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.ShellFlowType), only needed for generating full bitfiles where the FINN design is integrated into a shell (so only needed if `BITFILE` is selected)
### Configuring the Performance <a id="config_perf"></a>
You can configure the performance (and correspondingly, the FPGA resource footprint) of the generated dataflow accelerator in two ways:
1) (basic) Set a target performance and let the compiler figure out the per-node parallelization settings.
2) (advanced) Specify a separate .json as `folding_config_file` that lists the degree of parallelization (as well as other hardware options) for each layer.
This notebook only deals with the basic approach, for which you need to set up:
* `target_fps`: target inference performance in frames per second. Note that the target may not be achievable due to specific layer constraints, or due to resource limitations of the FPGA.
* `synth_clk_period_ns`: target clock period (in nanoseconds) for Vivado synthesis, e.g. `synth_clk_period_ns=5.0` will target a 200 MHz clock. Note that the target clock period may not be achievable depending on the FPGA part and design complexity.
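%% Cell type:markdown id: tags:
As a small illustration (this cell is not part of the original notebook): the clock period and clock frequency are reciprocals, so the `synth_clk_period_ns = 10.0` used in the builds below corresponds to a 100 MHz target clock, which is also the `fclk[mhz]` reported by rtlsim later on.
%% Cell type:code id: tags:
``` python
synth_clk_period_ns = 10.0
target_clock_mhz = 1000.0 / synth_clk_period_ns
print("Target clock: %.1f MHz" % target_clock_mhz)
```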
%% Cell type:markdown id: tags:
## Launch a Build: Only Estimate Reports <a id="build_estimate_report"></a>
First, we'll launch a build that only generates the estimate reports, which does not require any synthesis. Note two things below: how the `generate_outputs` only contains `ESTIMATE_REPORTS`, but also how the `steps` uses a value of `estimate_only_dataflow_steps`. This skips steps like HLS synthesis to provide a quick estimate from analytical models.
%% Cell type:code id: tags:
``` python
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil
model_dir = os.environ['FINN_ROOT'] + "/notebooks/end2end_example/cybersecurity"
model_file = model_dir + "/cybsec-mlp-ready.onnx"
estimates_output_dir = "output_estimates_only"
#Delete previous run results if exist
if os.path.exists(estimates_output_dir):
shutil.rmtree(estimates_output_dir)
print("Previous run results deleted!")
cfg_estimates = build.DataflowBuildConfig(
output_dir = estimates_output_dir,
mvau_wwidth_max = 80,
target_fps = 1000000,
synth_clk_period_ns = 10.0,
fpga_part = "xc7z020clg400-1",
steps = build_cfg.estimate_only_dataflow_steps,
generate_outputs=[
build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
]
)
```
%% Output
Previous run results deleted!
%% Cell type:code id: tags:
``` python
%%time
build.build_dataflow_cfg(model_file, cfg_estimates)
```
%% Output
Building dataflow accelerator from cybsec-mlp-ready.onnx
Intermediate outputs will be generated in /tmp/finn_dev_streichg
Final outputs will be generated in output_estimates_only
Build log is at output_estimates_only/build_dataflow.log
Running step: step_qonnx_to_finn [1/8]
Running step: step_tidy_up [2/8]
Running step: step_streamline [3/8]
/home/streichg/ACCL/finn/deps/qonnx/src/qonnx/core/modelwrapper.py:93: UserWarning: Some old-style domain attributes were automatically converted to new-style,
i.e. domain=finn to domain=qonnx.custom_op.<general|fpgadataflow|...>
warnings.warn(
Running step: step_convert_to_hls [4/8]
Running step: step_create_dataflow_partition [5/8]
Running step: step_target_fps_parallelization [6/8]
Running step: step_apply_folding_config [7/8]
Running step: step_generate_estimate_reports [8/8]
Completed successfully
CPU times: user 1.03 s, sys: 1.34 s, total: 2.37 s
Wall time: 631 ms
0
%% Cell type:code id: tags:
``` python
assert os.path.exists(estimates_output_dir + "/report/estimate_network_performance.json")
```
%% Cell type:markdown id: tags:
We'll now examine the generated outputs from this build. If we look under the outputs directory, we'll find a subfolder with the generated estimate reports.
%% Cell type:code id: tags:
``` python
! ls {estimates_output_dir}
```
%% Output
auto_folding_config.json intermediate_models time_per_step.json
build_dataflow.log report
%% Cell type:code id: tags:
``` python
! ls {estimates_output_dir}/report
```
%% Output
estimate_layer_config_alternatives.json estimate_network_performance.json
estimate_layer_cycles.json op_and_param_counts.json
estimate_layer_resources.json
%% Cell type:markdown id: tags:
We see that various reports have been generated as .json files. Let's examine the contents of the `estimate_network_performance.json` for starters. Here, we can see the analytical estimates for the performance and latency.
%% Cell type:code id: tags:
``` python
! cat {estimates_output_dir}/report/estimate_network_performance.json
```
%% Output
{
"critical_path_cycles": 252,
"max_cycles": 64,
"max_cycles_node_name": "MatrixVectorActivation_1",
"estimated_throughput_fps": 1562500.0,
"estimated_latency_ns": 2520.0
}
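%% Cell type:markdown id: tags:
To see how these numbers relate, here is a small cross-check (an illustrative cell, not part of the original notebook): with the 10 ns clock configured above, the estimated throughput is 1 / (`max_cycles` * clock period) and the estimated latency is `critical_path_cycles` * clock period.
%% Cell type:code id: tags:
``` python
clk_ns = 10.0                # synth_clk_period_ns from cfg_estimates
max_cycles = 64              # slowest layer, MatrixVectorActivation_1
critical_path_cycles = 252
print("estimated throughput: %.1f fps" % (1e9 / (max_cycles * clk_ns)))
print("estimated latency   : %.1f ns" % (critical_path_cycles * clk_ns))
```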
%% Cell type:markdown id: tags:
Since all of these reports are .json files, we can easily load them into Python for further processing. This can be useful if you are building your own design automation tools on top of FINN. Let's define a helper function and look at the `estimate_layer_cycles.json` report.
%% Cell type:code id: tags:
``` python
import json
def read_json_dict(filename):
with open(filename, "r") as f:
ret = json.load(f)
return ret
```
%% Cell type:code id: tags:
``` python
read_json_dict(estimates_output_dir + "/report/estimate_layer_cycles.json")
```
%% Output
{'MatrixVectorActivation_0': 60,
'MatrixVectorActivation_1': 64,
'MatrixVectorActivation_2': 64,
'MatrixVectorActivation_3': 64}
%% Cell type:markdown id: tags:
Here, we can see the estimated number of clock cycles each layer will take. Recall that all of these layers run in parallel, so the slowest layer determines the overall throughput of the entire neural network. FINN attempts to parallelize each layer such that they all take a similar number of cycles, staying within the cycle budget required to meet `target_fps`. Additionally, by summing up all layer cycle estimates, one can obtain an estimate of the overall latency of the whole network.
Finally, we can see the layer-by-layer resource estimates in the `estimate_layer_resources.json` report:
%% Cell type:code id: tags:
``` python
read_json_dict(estimates_output_dir + "/report/estimate_layer_resources.json")
```
%% Output
{'MatrixVectorActivation_0': {'BRAM_18K': 36,
'BRAM_efficiency': 0.11574074074074074,
'LUT': 8184,
'URAM': 0,
'URAM_efficiency': 1,
'DSP': 0},
'MatrixVectorActivation_1': {'BRAM_18K': 4,
'BRAM_efficiency': 0.1111111111111111,
'LUT': 1217,
'URAM': 0,
'URAM_efficiency': 1,
'DSP': 0},
'MatrixVectorActivation_2': {'BRAM_18K': 4,
'BRAM_efficiency': 0.1111111111111111,
'LUT': 1217,
'URAM': 0,
'URAM_efficiency': 1,
'DSP': 0},
'MatrixVectorActivation_3': {'BRAM_18K': 1,
'BRAM_efficiency': 0.006944444444444444,
'LUT': 341,
'URAM': 0,
'URAM_efficiency': 1,
'DSP': 0},
'total': {'BRAM_18K': 45.0, 'LUT': 10959.0, 'URAM': 0.0, 'DSP': 0.0}}
%% Cell type:markdown id: tags:
This particular report is useful to determine whether the current configuration will fit into a particular FPGA. If you see that the resource requirements are too high for the FPGA you had in mind, you should consider lowering the `target_fps`.
**Note that the analytical models tend to over-estimate how much resources are needed, since they can't capture the effects of various synthesis optimizations.**
%% Cell type:markdown id: tags:
## Launch a Build: Stitched IP, out-of-context synth and rtlsim Performance <a id="build_ip_synth_rtlsim"></a>
Once we have a configuration that gives satisfactory estimates, we can move on to generating the accelerator. We can do this in different ways depending on how we want to integrate the accelerator into a larger system. For instance, if we have a larger streaming system built in Vivado or if we'd like to re-use this generated accelerator as an IP component in other projects, the `STITCHED_IP` output product is a good choice. We can also use the `OOC_SYNTH` output product to get post-synthesis resource and clock frequency numbers for our accelerator.
<font color="red">**Live FINN tutorial:** These next builds will take about 10 minutes to complete since multiple calls to Vivado and a call to RTL simulation are involved. While this is running, you can examine the generated files with noVNC -- it is running on **(your AWS URL):6080/vnc.html**
* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MatrixVectorActivation_XXXXXX`
* Once the `step_create_stitched_ip [11/16]` below is completed, you can view the generated stitched IP in Vivado under `/home/ubuntu/finn/notebooks/end2end_example/cybersecurity/output_ipstitch_ooc_rtlsim/stitched_ip`
</font>
%% Cell type:code id: tags:
``` python
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil
model_file = model_dir + "/cybsec-mlp-ready.onnx"
rtlsim_output_dir = "output_ipstitch_ooc_rtlsim"
#Delete previous run results if exist
if os.path.exists(rtlsim_output_dir):
shutil.rmtree(rtlsim_output_dir)
print("Previous run results deleted!")
cfg_stitched_ip = build.DataflowBuildConfig(
output_dir = rtlsim_output_dir,
mvau_wwidth_max = 80,
target_fps = 1000000,
synth_clk_period_ns = 10.0,
fpga_part = "xc7z020clg400-1",
generate_outputs=[
build_cfg.DataflowOutputType.STITCHED_IP,
build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
build_cfg.DataflowOutputType.OOC_SYNTH,
]
)
```
%% Output
Previous run results deleted!
%% Cell type:code id: tags:
``` python
%%time
build.build_dataflow_cfg(model_file, cfg_stitched_ip)
```
%% Output
Building dataflow accelerator from cybsec-mlp-ready.onnx
Intermediate outputs will be generated in /tmp/finn_dev_streichg
Final outputs will be generated in output_ipstitch_ooc_rtlsim
Build log is at output_ipstitch_ooc_rtlsim/build_dataflow.log
Running step: step_qonnx_to_finn [1/17]
Running step: step_tidy_up [2/17]
Running step: step_streamline [3/17]
Running step: step_convert_to_hls [4/17]
Running step: step_create_dataflow_partition [5/17]
Running step: step_target_fps_parallelization [6/17]
Running step: step_apply_folding_config [7/17]
Running step: step_generate_estimate_reports [8/17]
Running step: step_hls_codegen [9/17]
Running step: step_hls_ipgen [10/17]
Running step: step_set_fifo_depths [11/17]
Running step: step_create_stitched_ip [12/17]
Running step: step_measure_rtlsim_performance [13/17]
Running step: step_out_of_context_synthesis [14/17]
Running step: step_synthesize_bitfile [15/17]
Running step: step_make_pynq_driver [16/17]
Running step: step_deployment_package [17/17]
Completed successfully
CPU times: user 2.37 s, sys: 1.07 s, total: 3.44 s
Wall time: 4min 58s
0
%% Cell type:code id: tags:
``` python
assert os.path.exists(rtlsim_output_dir + "/report/ooc_synth_and_timing.json")
assert os.path.exists(rtlsim_output_dir + "/report/rtlsim_performance.json")
assert os.path.exists(rtlsim_output_dir + "/final_hw_config.json")
```
%% Cell type:markdown id: tags:
Why is e.g. `step_synthesize_bitfile` listed above even though we didn't ask for a bitfile in the output products? This is because we're using the default set of build steps, which includes `step_synthesize_bitfile`. Since its output product is not selected, this step will do nothing.
%% Cell type:markdown id: tags:
Among the output products, we will find the accelerator exported as a stitched IP block design:
%% Cell type:code id: tags:
``` python
! ls {rtlsim_output_dir}/stitched_ip
```
%% Output
all_verilog_srcs.txt finn_vivado_stitch_proj.xpr
data ip
finn_vivado_stitch_proj.cache make_project.sh
finn_vivado_stitch_proj.gen make_project.tcl
finn_vivado_stitch_proj.hw vivado.jou
finn_vivado_stitch_proj.ip_user_files vivado.log
finn_vivado_stitch_proj.srcs
%% Cell type:markdown id: tags:
We also have a few reports generated by these output products, different from the ones generated by `ESTIMATE_REPORTS`.
%% Cell type:code id: tags:
``` python
! ls {rtlsim_output_dir}/report
```
%% Output
estimate_layer_resources_hls.json rtlsim_performance.json
ooc_synth_and_timing.json
%% Cell type:markdown id: tags:
In `ooc_synth_and_timing.json` we can find the post-synthesis resource usage and maximum clock frequency estimate for the accelerator. Note that the clock frequency estimate here tends to be optimistic, since out-of-context synthesis is less constrained.
%% Cell type:code id: tags:
``` python
! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json
```
%% Output
{
"vivado_proj_folder": "/tmp/finn_dev_streichg/synth_out_of_context_3tfop3vi/results_finn_design_wrapper",
"LUT": 7226.0,
"LUTRAM": 44.0,
"FF": 8561.0,
"DSP": 0.0,
"BRAM": 22.0,
"BRAM_18K": 0.0,
"BRAM_36K": 22.0,
"URAM": 0.0,
"Carry": 308.0,
"WNS": 0.747,
"Delay": 0.747,
"vivado_version": 2022.1,
"vivado_build_no": 3526262.0,
"": 0,
"fmax_mhz": 108.07305738679347,
"estimated_throughput_fps": 1688641.5216686479
}
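%% Cell type:markdown id: tags:
Two illustrative cross-checks on this report (this cell is not part of the original notebook): the `estimated_throughput_fps` is simply the achievable clock frequency divided by the slowest layer's cycle count (64, from the earlier estimate report), and the post-synthesis LUT count comes in well below the analytical estimate of 10959 LUTs, showing the over-estimation mentioned earlier.
%% Cell type:code id: tags:
``` python
fmax_mhz = 108.07305738679347
max_cycles = 64
print("throughput from fmax: %.1f fps" % (fmax_mhz * 1e6 / max_cycles))
print("LUTs: 7226 post-synthesis vs 10959 analytical estimate")
```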
%% Cell type:markdown id: tags:
In `rtlsim_performance.json` we can find the steady-state throughput and latency for the accelerator, as obtained by rtlsim. If the DRAM bandwidth numbers reported here are below what the hardware platform is capable of (i.e. the accelerator is not memory-bound), you can expect the same steady-state throughput (excluding any software/driver overheads) in real hardware.
%% Cell type:code id: tags:
``` python
! cat {rtlsim_output_dir}/report/rtlsim_performance.json
```
%% Output
{
"N_IN_TXNS": 15,
"N_OUT_TXNS": 1,
"cycles": 212,
"N": 1,
"latency_cycles": 212,
"runtime[ms]": 0.00212,
"throughput[images/s]": 471698.11320754717,
"fclk[mhz]": 100.0
}
%% Cell type:markdown id: tags:
Finally, let's have a look at `final_hw_config.json`. This is the node-by-node hardware configuration determined by the FINN compiler, including FIFO depths, parallelization settings (PE/SIMD) and others. If you want to optimize your build further (the "advanced" method we mentioned under "Configuring the performance"), you can use this .json file as the `folding_config_file` for a new run to use it as a starting point for further exploration and optimizations.
%% Cell type:code id: tags:
``` python
! cat {rtlsim_output_dir}/final_hw_config.json
```
%% Output
{
"Defaults": {},
"StreamingFIFO_0": {
"ram_style": "auto",
"depth": 32,
"impl_style": "rtl",
"inFIFODepths": [
0
],
"outFIFODepths": [
0
]
},
"MatrixVectorActivation_0": {
"PE": 16,
"SIMD": 40,
"ram_style": "auto",
"resType": "lut",
"mem_mode": "decoupled",
"runtime_writeable_weights": 0,
"inFIFODepths": [
32
],
"outFIFODepths": [
0
]
},
"StreamingDataWidthConverter_Batch_0": {
"impl_style": "vivado",
"inFIFODepths": [
0
],
"outFIFODepths": [
0
]
},
"MatrixVectorActivation_1": {
"PE": 1,
"SIMD": 64,
"ram_style": "auto",
"resType": "lut",
"mem_mode": "decoupled",
"runtime_writeable_weights": 0,
"inFIFODepths": [
0
],
"outFIFODepths": [
0
]
},
"StreamingDataWidthConverter_Batch_1": {
"impl_style": "hls",
"inFIFODepths": [
0
],
"outFIFODepths": [
0
]
},
"MatrixVectorActivation_2": {
"PE": 1,
"SIMD": 64,
"ram_style": "auto",
"resType": "lut",
"mem_mode": "decoupled",
"runtime_writeable_weights": 0,
"inFIFODepths": [
0
],
"outFIFODepths": [
0
]
},
"MatrixVectorActivation_3": {
"PE": 1,
"SIMD": 1,
"ram_style": "auto",
"resType": "lut",
"mem_mode": "decoupled",
"runtime_writeable_weights": 0,
"inFIFODepths": [
0
],
"outFIFODepths": [
0
]
}
}
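%% Cell type:markdown id: tags:
A sketch of the "advanced" flow mentioned earlier (this cell is not part of the original notebook and is illustrative only): the generated `final_hw_config.json` can be passed back into a new build as `folding_config_file`, so you can hand-tune the PE/SIMD or FIFO settings in that file and rebuild from it instead of using `target_fps`.
%% Cell type:code id: tags:
``` python
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg

cfg_tuned = build.DataflowBuildConfig(
    output_dir = "output_tuned",
    folding_config_file = rtlsim_output_dir + "/final_hw_config.json",
    synth_clk_period_ns = 10.0,
    fpga_part = "xc7z020clg400-1",
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
    ]
)
# build.build_dataflow_cfg(model_file, cfg_tuned)  # uncomment to launch the tuned build
```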
%% Cell type:markdown id: tags:
## (Optional) Launch a Build: PYNQ Bitfile and Driver <a id="build_bitfile_driver"></a>
<font color="red">**Live FINN tutorial:** This section is not included in the hands-on tutorial due to the bitfile synthesis time (15-20 min). If you own a PYNQ board, we encourage you to uncomment the cells below to try it out on your own after the tutorial.</font>
%% Cell type:code id: tags:
``` python
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil
model_file = model_dir + "/cybsec-mlp-ready.onnx"
final_output_dir = "output_final"
#Delete previous run results if exist
if os.path.exists(final_output_dir):
shutil.rmtree(final_output_dir)
print("Previous run results deleted!")
cfg = build.DataflowBuildConfig(
output_dir = final_output_dir,
mvau_wwidth_max = 80,
target_fps = 1000000,
synth_clk_period_ns = 10.0,
board = "Pynq-Z1",
shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,
generate_outputs=[
build_cfg.DataflowOutputType.BITFILE,
build_cfg.DataflowOutputType.PYNQ_DRIVER,
build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
]
)
```
%% Cell type:code id: tags:
``` python
#%%time
#build.build_dataflow_cfg(model_file, cfg)
```
%% Cell type:markdown id: tags:
For our final build, the output products include the bitfile (and the accompanying .hwh file, also needed to execute correctly on PYNQ for Zynq platforms):
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/bitfile
```
%% Cell type:markdown id: tags:
The generated Python driver lets us execute the accelerator on PYNQ platforms with simple numpy i/o. You can find some notebooks showing how to use FINN-generated accelerators at runtime in the [finn-examples](https://github.com/Xilinx/finn-examples) repository.
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/driver
```
%% Cell type:markdown id: tags:
The reports folder contains the post-synthesis resource and timing reports:
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/report
```
%% Cell type:markdown id: tags:
Finally, we have the `deploy` folder which contains everything you need to copy onto the target board to get the accelerator running:
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/deploy
```
%% Cell type:markdown id: tags:
## (Optional) Run on PYNQ board <a id="run_on_pynq"></a>
<font color="red">**Live FINN tutorial:** This section is not included in the hands-on tutorial due to the bitfile synthesis time (15-20 min) of the previous section. If you own a PYNQ board, we encourage you to uncomment the cells below to try it out on your own after the tutorial.</font>
To test the accelerator on the board, we'll put a copy of the dataset and a premade Python script that validates the accuracy into the `driver` folder, then make a zip archive of the whole deployment folder.
%% Cell type:code id: tags:
``` python
#! cp unsw_nb15_binarized.npz {final_output_dir}/deploy/driver
```
%% Cell type:code id: tags:
``` python
#! cp validate-unsw-nb15.py {final_output_dir}/deploy/driver
```
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/deploy/driver
```
%% Cell type:code id: tags:
``` python
#from shutil import make_archive
#make_archive('deploy-on-pynq', 'zip', final_output_dir+"/deploy")
```
%% Cell type:markdown id: tags:
You can now download the created zipfile (**File -> Open**, mark the checkbox next to the `deploy-on-pynq.zip` and select Download from the toolbar), then copy it to your PYNQ board (for instance via `scp` or `rsync`). Then, run the following commands **on the PYNQ board** to extract the archive and run the validation:
%% Cell type:markdown id: tags:
```shell
unzip deploy-on-pynq.zip -d finn-cybsec-mlp-demo
cd finn-cybsec-mlp-demo/driver
sudo python3.6 -m pip install bitstring
sudo python3.6 validate-unsw-nb15.py --batchsize 1000
```
%% Cell type:markdown id: tags:
You should see `Final accuracy: 91.868293` at the end. You may have noticed that the validation doesn't *quite* run at 1M inferences per second. This is because of the Python packing/unpacking and data movement overheads. To see this in more detail, the generated driver includes a benchmarking mode that shows the runtime breakdown:
%% Cell type:markdown id: tags:
```shell
sudo python3.6 driver.py --exec_mode throughput_test --bitfile ../bitfile/finn-accel.bit --batchsize 1000
cat nw_metrics.txt
```
%% Cell type:markdown id: tags:
```{'runtime[ms]': 1.0602474212646484,
'throughput[images/s]': 943176.0737575893,
'DRAM_in_bandwidth[Mb/s]': 70.7382055318192,
'DRAM_out_bandwidth[Mb/s]': 0.9431760737575894,
'fclk[mhz]': 100.0,
'batch_size': 1000,
'fold_input[ms]': 9.679794311523438e-05,
'pack_input[ms]': 0.060115814208984375,
'copy_input_data_to_device[ms]': 0.002428770065307617,
'copy_output_data_from_device[ms]': 0.0005249977111816406,
'unpack_output[ms]': 0.3773000240325928,
'unfold_output[ms]': 6.818771362304688e-05}```
%% Cell type:markdown id: tags:
Here, the various `pack_input/unpack_output` calls show the overhead of packing/unpacking the inputs/outputs to convert from numpy arrays to the bit-contiguous data representation our accelerator expects. The `copy_input_data_to_device` and `copy_output_data_from_device` indicate the cost of moving the data between the CPU and accelerator memories. These overheads can dominate the execution time when running with small batch sizes.
Finally, we can see that `throughput[images/s]`, which is the pure hardware throughput without any software and data movement overheads, is close to 1M inferences per second.
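%% Cell type:markdown id: tags:
As a final cross-check (illustrative only, not part of the original notebook): the reported DRAM bandwidth numbers are consistent with a 600-bit input packed into 75 bytes and a single output byte per inference at the measured throughput.
%% Cell type:code id: tags:
``` python
throughput = 943176.0737575893       # images/s from the metrics above
in_bytes = 600 // 8                  # 600 1-bit inputs packed into 75 bytes
out_bytes = 1                        # 1-bit output, assumed padded to one byte
print("DRAM in : %.2f MB/s" % (throughput * in_bytes / 1e6))
print("DRAM out: %.2f MB/s" % (throughput * out_bytes / 1e6))
```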
......
......@@ -184,7 +184,7 @@ fi
# Launch container with current directory mounted
# important to pass the --init flag here for correct Vivado operation, see:
# https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins
DOCKER_EXEC="docker run -t --rm $DOCKER_INTERACTIVE --tty --init "
DOCKER_EXEC="docker run -t $DOCKER_INTERACTIVE --tty --init "
DOCKER_EXEC+="--hostname $DOCKER_INST_NAME "
DOCKER_EXEC+="-e SHELL=/bin/bash "
DOCKER_EXEC+="-w $SCRIPTPATH "
......
......@@ -62,6 +62,11 @@ setup_requires = pyscaffold>=3.2a0,<3.3a0
# tests_require = pytest; pytest-cov
# Require a specific Python version, e.g. Python 2.7 or >= 3.4
# python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
install_requires =
referencing==0.8.11
jsonschema==4.17.3
jsonschema-specifications==2022.12.3
jupyter-events==0.6.3
[options.packages.find]
where = src
......
......@@ -63,6 +63,8 @@ from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch
from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation
from finn.custom_op.fpgadataflow.accl import AcclIn, AcclOut
custom_op = dict()
# make sure new HLSCustomOp subclasses are imported here so that they get
......@@ -93,3 +95,5 @@ custom_op["StreamingConcat"] = StreamingConcat
custom_op["CheckSum"] = CheckSum
custom_op["StreamingEltwise"] = StreamingEltwise
custom_op["FMPadding_rtl"] = FMPadding_rtl
custom_op["AcclIn"] = AcclIn
custom_op["AcclOut"] = AcclOut
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import numpy as np
import warnings
from qonnx.core.datatype import DataType
from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
from IPython.core.debugger import set_trace
import subprocess
import os
class AcclOp(HLSCustomOp):
def get_nodeattr_types(self):
my_attrs = {
"NumChannels": ("i", True, 0),
# FINN input datatype
"dataType": ("s", True, ""),
# utilized width of accl words
"intfWidth": ("i", False, 32),
# Width of input or output stream
"streamWidth": ("i", False, 32),
# shape describing input vecs per execution
"numInputVectors": ("ints", False, [1]),
# accl specific attrs
"startPort": ("i", False, 5500),
"rank": ("i", True, 0),
"worldSize": ("i", True, 0),
"otherRank": ("i", True, 0),
}
my_attrs.update(super().get_nodeattr_types())
return my_attrs
def get_normal_input_shape(self, ind=0):
vecs = list(self.get_nodeattr("numInputVectors"))
num_ch = self.get_nodeattr("NumChannels")
ishape = tuple(vecs + [num_ch])
return ishape
def get_normal_output_shape(self, ind=0):
return self.get_normal_input_shape()
def compile_singlenode_code(self):
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
subprocess.run(["/usr/bin/cmake", f"{os.environ['FINN_ROOT']}/ACCL/test/model/bfm"],
cwd=code_gen_dir)
subprocess.run(["make"], cwd=code_gen_dir)
self.set_nodeattr("executable_path", code_gen_dir + "/bin/node_model")
def get_number_output_values(self):
oshape = self.get_normal_output_shape()
itype_bits = self.get_input_datatype().bitwidth()
stream_width = self.get_nodeattr("streamWidth")
nelems = np.prod(oshape)
nbits = nelems * itype_bits
assert (
nbits % stream_width == 0
), "DMA: total transfer size must be word multiple"
ovalues = nbits // stream_width
return ovalues
def make_shape_compatible_op(self, model):
exp_ishape = self.get_normal_input_shape()
oshape = self.get_normal_output_shape()
ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
assert ishape == exp_ishape, "Unexpected input shape."
return super().make_const_shape_op(oshape)
def infer_node_datatype(self, model):
node = self.onnx_node
idt = model.get_tensor_datatype(node.input[0])
if idt != self.get_input_datatype():
warn_str = "inputDataType changing for %s: %s -> %s " % (
node.name,
str(self.get_input_datatype()),
str(idt),
)
warnings.warn(warn_str)
self.set_nodeattr("dataType", idt.name)
model.set_tensor_datatype(node.output[0], idt)
def get_input_datatype(self, ind=0):
"""Returns FINN DataType of input."""
return DataType[self.get_nodeattr("dataType")]
def get_output_datatype(self, ind=0):
"""Returns FINN DataType of output. (Same as input datatype)"""
return self.get_input_datatype()
def global_includes(self):
self.code_gen_dict["$GLOBALS$"] = [
'#include <accl_hls.h>',
'#include "cclo_bfm.h"',
'#include "accl_funcs.hpp"',
]
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = [
'#pragma HLS INTERFACE axis port=cmd_to_cclo',
'#pragma HLS INTERFACE axis port=sts_from_cclo',
'#pragma HLS INTERFACE axis port=data_to_cclo',
'#pragma HLS INTERFACE axis port=data_from_cclo',
'#pragma HLS INTERFACE axis port=stream',
]
def strm_decl(self):
start_port = self.get_nodeattr("startPort")
rank = self.get_nodeattr("rank")
world_size = self.get_nodeattr("worldSize")
dest = self.get_nodeattr("worldSize")
self.code_gen_dict["$STREAMDECLARATIONS$"] = [
'hlslib::Stream<command_word> cmd_to_cclo("cmd_to_cclo"), sts_from_cclo("sts_from_cclo");',
'hlslib::Stream<stream_word, 512> data_from_cclo("data_from_cclo"), data_to_cclo("data_to_cclo");',
'hls::stream<ap_uint<{}>> stream;'.format(self.get_nodeattr("streamWidth")),
'std::vector<unsigned int> dest{9};',
'CCLO_BFM cclo({}, {}, {}, dest, cmd_to_cclo, sts_from_cclo, data_from_cclo, data_to_cclo); cclo.run();'.format(start_port, rank, world_size, dest),
]
def defines(self, mode):
self.code_gen_dict["$DEFINES$"] = ['']
def verify_node(self):
...
class AcclOut(AcclOp):
def get_instream_width(self, ind=0):
return self.get_nodeattr("streamWidth")
def get_outstream_width(self, ind=0):
return self.get_nodeattr("intfWidth")
def get_folded_output_shape(self, ind=0):
shape = list(self.get_normal_output_shape())
itype_bits = self.get_output_datatype().bitwidth()
intfw = self.get_nodeattr("streamWidth")
assert (
intfw % itype_bits == 0
), "Input stream width must be a multiple of datatype bits"
elems_per_word = intfw // itype_bits
assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
fold_depth = shape[-1] // elems_per_word
shape[-1] = fold_depth
shape.append(elems_per_word)
return tuple(shape)
def docompute(self):
intf_width = self.get_nodeattr("intfWidth")
stream_width = self.get_nodeattr("streamWidth")
fold = self.get_folded_output_shape()[-1]
self.code_gen_dict["$DOCOMPUTE$"] = [
'accl_out<{}, {}, {}>({}, {}, {}, cmd_to_cclo, sts_from_cclo, data_to_cclo, data_from_cclo, stream);'.format(intf_width, stream_width, fold, 0, 0, 0)
]
def execute_node(self, context, graph):
mode = self.get_nodeattr("exec_mode")
node = self.onnx_node
if mode != "cppsim":
raise Exception(
"""Invalid value for attribute exec_mode! Is currently set to: {}
has to be set to one of the following value ("cppsim")""".format(
mode
)
)
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
assert (
str(context[node.input[0]].dtype) == "float32"
), """Input datatype is
not float32 as expected."""
expected_inp_shape = self.get_folded_output_shape()
expected_inp_shape = (*expected_inp_shape[:-1], expected_inp_shape[-1] * self.get_input_datatype().bitwidth())
reshaped_input = context[node.input[0]].reshape(expected_inp_shape)
if self.get_input_datatype() == DataType["BIPOLAR"]:
# store bipolar activations as binary
reshaped_input = (reshaped_input + 1) / 2
export_idt = DataType["BINARY"]
else:
export_idt = self.get_input_datatype()
# make copy before saving the array
reshaped_input = reshaped_input.copy()
np.save(
os.path.join(code_gen_dir, "input.npy"),
reshaped_input,
)
super().exec_precompiled_singlenode_model()
def read_npy_data(self):
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
dtype = self.get_input_datatype()
elem_bits = dtype.bitwidth()
packed_bits = self.get_instream_width()
packed_hls_type = "ap_uint<%d>" % packed_bits
elem_hls_type = dtype.get_hls_datatype_str()
npy_type = "float"
npy_in = "%s/input.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"] = []
# note: the innermost dim is reversed for the input
self.code_gen_dict["$READNPYDATA$"].append(
'npy2apintstream<%s, %s, %d, %s>("%s", stream, false);'
% (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
)
def save_as_npy(self):
self.code_gen_dict["$SAVEASCNPY$"] = []
def dataoutstrm(self):
self.code_gen_dict["$DATAOUTSTREAM$"] = ['']
def blackboxfunction(self):
pass
class AcclIn(AcclOp):
def get_instream_width(self, ind=0):
return self.get_nodeattr("intfWidth")
def get_outstream_width(self, ind=0):
return self.get_nodeattr("streamWidth")
def get_folded_input_shape(self, ind=0):
shape = list(self.get_normal_input_shape())
itype_bits = self.get_input_datatype().bitwidth()
intfw = self.get_nodeattr("streamWidth")
assert (
intfw % itype_bits == 0
), "Input stream width must be a multiple of datatype bits"
elems_per_word = intfw // itype_bits
assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
fold_depth = shape[-1] // elems_per_word
shape[-1] = fold_depth
shape.append(elems_per_word)
return tuple(shape)
def docompute(self):
intf_width = self.get_nodeattr("intfWidth")
stream_width = self.get_nodeattr("streamWidth")
fold = self.get_folded_input_shape()[-1]
self.code_gen_dict["$DOCOMPUTE$"] = [
'accl_in<{}, {}, {}>({}, {}, {}, cmd_to_cclo, sts_from_cclo, data_to_cclo, data_from_cclo, stream);'.format(intf_width, stream_width, fold, 0, 0, 0)
]
def execute_node(self, context, graph):
mode = self.get_nodeattr("exec_mode")
node = self.onnx_node
if mode != "cppsim":
raise Exception(
"""Invalid value for attribute exec_mode! Is currently set to: {}
has to be set to one of the following values ("cppsim")""".format(
mode
)
)
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
super().exec_precompiled_singlenode_model()
super().npy_to_dynamic_output(context)
if self.get_output_datatype() == DataType["BIPOLAR"]:
out = context[node.output[0]]
out = 2 * out - 1
context[node.output[0]] = out
oshape = self.get_normal_output_shape()
assert (
context[node.output[0]].shape == oshape
), """Output shape is not as expected"""
def read_npy_data(self):
self.code_gen_dict["$READNPYDATA$"] = ['']
def save_as_npy(self):
self.code_gen_dict["$SAVEASCNPY$"] = ['']
def dataoutstrm(self):
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
dtype = self.get_output_datatype()
if dtype == DataType["BIPOLAR"]:
# use binary for bipolar storage
dtype = DataType["BINARY"]
elem_bits = dtype.bitwidth()
packed_bits = self.get_outstream_width()
packed_hls_type = "ap_uint<%d>" % packed_bits
elem_hls_type = dtype.get_hls_datatype_str()
npy_type = "float"
npy_out = "%s/output.npy" % code_gen_dir
shape = self.get_folded_input_shape()
shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
self.code_gen_dict["$DATAOUTSTREAM$"] = [
'apintstream2npy<%s, %s, %d, %s>(stream, %s, "%s", false);'
% (
packed_hls_type,
elem_hls_type,
elem_bits,
npy_type,
shape_cpp_str,
npy_out,
)
]
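# For illustration (hypothetical values): with a 32-bit output stream of
# UINT8 elements and a folded shape of (1, 16, 4), this renders as
#   apintstream2npy<ap_uint<32>, ap_uint<8>, 8, float>(stream, {1, 16, 4},
#       "<code_gen_dir>/output.npy", false);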
def blackboxfunction(self):
pass
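# A minimal construction sketch for exercising AcclOut in a standalone test.
# Tensor names and attribute values below are illustrative assumptions, not
# taken from this commit; the attribute names mirror those read or set here.
from onnx import helper as oh

accl_out_node = oh.make_node(
    "AcclOut",
    ["inp"],   # stream from the preceding fpgadataflow node
    ["outp"],  # tensor handed to the CCLO
    domain="finn.custom_op.fpgadataflow",
    backend="fpgadataflow",
    dataType="UINT8",
    numInputVectors=[1, 16],
    NumChannels=4,
    intfWidth=512,
    streamWidth=32,
    rank=0,
    worldSize=2,
    otherRank=1,
    startPort=5500,
)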
......@@ -33,6 +33,7 @@ from finn.core.onnx_exec import execute_onnx
# TODO move StreamingDataflowPartition to HLSCustomOp base class
from IPython.core.debugger import set_trace
class StreamingDataflowPartition(CustomOp):
"""Class that corresponds to the meta/container node StreamingDataflowPartition
......@@ -71,6 +72,7 @@ class StreamingDataflowPartition(CustomOp):
if old_iname != new_iname:
inp_ctx[new_iname] = inp_ctx[old_iname]
del inp_ctx[old_iname]
ret = execute_onnx(model, inp_ctx, return_full_exec_context)
# outputs may have been renamed in partition
for i, node_oname in enumerate(node.output):
......
......@@ -43,13 +43,15 @@ class CreateDataflowPartition(Transformation):
that indicates the filename for the second graph that only contains
dataflow nodes. No action is taken if there are no dataflow nodes."""
def __init__(self, partition_model_dir=None):
def __init__(self, partition_model_dir=None, num_devices=1):
super().__init__()
if partition_model_dir is None:
self.partition_model_dir = make_build_dir("dataflow_partition_")
else:
self.partition_model_dir = partition_model_dir
self.num_devices = num_devices
def apply(self, model):
def filter_fc_extw(x):
if x.op_type == "IODMA":
......@@ -62,6 +64,23 @@ class CreateDataflowPartition(Transformation):
if len(extw_dma_nodes) > 0:
model = model.transform(ExternalizeParams())
node_stats = dict()
def compute_node_stats(node):
if node.name not in node_stats:
num_nodes_up_to = 1
predecessors = model.find_direct_predecessors(node)
if predecessors:
for pred in predecessors:
compute_node_stats(pred)
num_nodes_up_to += node_stats[pred.name]
node_stats[node.name] = num_nodes_up_to
for node in model.graph.node:
compute_node_stats(node)
total_nodes = max(node_stats.values())
def assign_partition_id(node):
if node.op_type in ["GenericPartition", "StreamingDataflowPartition"]:
return -1
......@@ -72,7 +91,7 @@ class CreateDataflowPartition(Transformation):
if assigned_partition is not None:
return assigned_partition.i
else:
return 0
return int(node_stats[node.name] / (total_nodes + 1) * self.num_devices)
else:
return -1
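# Worked example: with total_nodes = 10 and num_devices = 2, a node with
# node_stats = 5 maps to int(5 / 11 * 2) = 0 while node_stats = 6 maps to
# int(6 / 11 * 2) = 1, so the first half of the chain is assigned to device 0
# and the second half to device 1.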
......@@ -115,4 +134,9 @@ class CreateDataflowPartition(Transformation):
new_p_node_inst.set_nodeattr("slr", slr)
new_p_node_inst.set_nodeattr("mem_port", mem_port)
p_model.set_metadata_prop("accl_world_size", str(self.num_devices))
p_model.set_metadata_prop("accl_rank", str(partition_ind))
p_model.save(node_model_filename)
return (parent_model, False)
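# Usage sketch (assumptions: the module path below matches the FINN source
# tree and "fpgadataflow_model.onnx" stands in for a model already converted
# to fpgadataflow layers):
from qonnx.core.modelwrapper import ModelWrapper

from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)

model = ModelWrapper("fpgadataflow_model.onnx")
# split the dataflow graph across two devices; each partition model is saved
# with accl_world_size / accl_rank metadata for the later InsertAccl pass
parent_model = model.transform(CreateDataflowPartition(num_devices=2))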
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import numpy as np
from onnx import TensorProto
from onnx import helper as oh
from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.base import Transformation
from qonnx.transformation.general import SortGraph
from qonnx.util.basic import get_by_name
class InsertAccl(Transformation):
def __init__(self, max_intfwidth=512):
super().__init__()
self.max_intfwidth = max_intfwidth
def apply(self, model):
modified = False
# only makes sense for a pure fpgadataflow graph -- so we check!
all_nodes = list(model.graph.node)
assert all(
get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
for x in all_nodes
)
world_size = int(model.get_metadata_prop("accl_world_size"))
rank = int(model.get_metadata_prop("accl_rank"))
insert_input = rank > 0
insert_output = rank < world_size - 1
if insert_input:
graph_in_names = [x.name for x in model.graph.input]
for graph_in_name in graph_in_names:
first_node = model.find_consumer(graph_in_name)
if first_node.op_type == "AcclIn":
continue
else:
in_shape = model.get_tensor_shape(graph_in_name)
in_dtype = model.get_tensor_datatype(graph_in_name)
first_node_inst = getCustomOp(first_node)
in_folded_shape = first_node_inst.get_folded_input_shape()
# take advantage of AXI stream width padding for alignment
# (AXI stream widths are always padded to a multiple of 8 bits)
# this is the width of the stream expected by the first node
padded_instream_width = first_node_inst.get_instream_width_padded()
padded_instream_bytes = padded_instream_width // 8
# determine the feasible interface width
transfer_bits = padded_instream_width * np.prod(
in_folded_shape[:-1]
)
# make new buffer
first_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
model.graph.value_info.append(first_node_in)
model.set_tensor_datatype(first_node_in.name, in_dtype)
# reroute first node input
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
first_node.input[0] = first_node_in.name
accl_node = oh.make_node(
"AcclIn",
[graph_in_name],
[first_node_in.name],
numInputVectors=in_folded_shape[:-1],
NumChannels=padded_instream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_instream_width,
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
rank=rank,
worldSize=world_size,
otherRank=rank-1,
)
model.graph.node.insert(0, accl_node)
modified = True
if insert_output:
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "AcclOut":
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
out_dtype = model.get_tensor_datatype(graph_out_name)
final_node_inst = getCustomOp(final_node)
out_folded_shape = final_node_inst.get_folded_output_shape()
# take advantage of AXI stream width padding for alignment
# (AXI stream widths are always padded to a multiple of 8 bits)
# this is the width of the stream produced by the final node
padded_outstream_width = (
final_node_inst.get_outstream_width_padded()
)
padded_outstream_bytes = padded_outstream_width // 8
# determine the feasible interface width
transfer_bits = padded_outstream_width * np.prod(
out_folded_shape[:-1]
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
final_node_out = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
)
model.graph.value_info.append(final_node_out)
model.set_tensor_datatype(final_node_out.name, out_dtype)
# reroute final node output to final_node_out_name
final_node.output[0] = final_node_out.name
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
accl_node = oh.make_node(
"AcclOut",
[final_node_out.name],
[graph_out_name],
numInputVectors=out_folded_shape[:-1],
NumChannels=padded_outstream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_outstream_width,
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
rank=rank,
worldSize=world_size,
otherRank=rank+1,
)
model.graph.node.append(accl_node)
modified = True
if modified:
model = model.transform(SortGraph())
return (model, modified)
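# Per-rank usage sketch (assumptions: file names are placeholders for the
# partition models written by CreateDataflowPartition, which carry the
# accl_rank / accl_world_size metadata consumed above):
from qonnx.core.modelwrapper import ModelWrapper

world_size = 2  # illustrative
for rank in range(world_size):
    p_model = ModelWrapper("partition_%d.onnx" % rank)
    p_model = p_model.transform(InsertAccl())
    p_model.save("partition_%d_accl.onnx" % rank)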
......@@ -106,7 +106,7 @@ class InsertIODMA(Transformation):
graph_in_names = [x.name for x in model.graph.input]
for graph_in_name in graph_in_names:
first_node = model.find_consumer(graph_in_name)
if first_node.op_type == "IODMA":
if first_node.op_type in ["IODMA", "AcclIn"]:
# IODMA or Accl node already inserted for this input
continue
else:
......@@ -153,7 +153,7 @@ class InsertIODMA(Transformation):
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA":
if final_node.op_type in ["IODMA", "AcclOut"]:
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
......
......@@ -176,11 +176,14 @@ class CppBuilder:
with open(self.compile_script, "w") as f:
f.write("#!/bin/bash \n")
f.write(bash_compile + "\n")
bash_command = ["bash", self.compile_script]
process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
process_compile.communicate()
def launch_process_helper(args, proc_env=None, cwd=None):
"""Helper function to launch a process in a way that facilitates logging
stdout/stderr with Python loggers.
......