Commit 966cb4f4 authored by Georg Streich

Enable IpGen for ACCL nodes

parent fca31d8a
Showing changed files with 336 additions and 139 deletions
#include <iostream>
#include <memory>
#include <accl.hpp>
#include <accl_network_utils.hpp>
#include "cclo_bfm.h"
std::unique_ptr<ACCL::ACCL> init_accl(
    unsigned int world_size,
    unsigned int rank,
    unsigned int start_port
) {
    accl_network_utils::acclDesign design = accl_network_utils::acclDesign::AXIS3x;
    std::vector<ACCL::rank_t> ranks;
    // TODO: Get the rxbuf size as a config parameter
    ranks = accl_network_utils::generate_ranks(true, rank, world_size, start_port, 16 * 1024);
    return accl_network_utils::initialize_accl(ranks, rank, true, design);
}
std::unique_ptr<CCLO_BFM> init_cclo_and_wait_for_input(
    unsigned int zmqport,
    unsigned int rank,
    unsigned int world_size,
    const std::vector<unsigned int> &dest,
    hlslib::Stream<command_word> &cmd_to_cclo,
    hlslib::Stream<command_word> &sts_from_cclo,
    hlslib::Stream<stream_word> &data_from_cclo,
    hlslib::Stream<stream_word> &data_to_cclo
) {
    auto cclo = std::make_unique<CCLO_BFM>(zmqport, rank, world_size, dest,
        cmd_to_cclo, sts_from_cclo, data_from_cclo, data_to_cclo);
    cclo->run();
    // Makeshift barrier
    std::cout << "CCLO BFM started" << std::endl;
    std::string inp;
    std::cin >> inp;
    return cclo;
}
const size_t accl_width = 512;
template<unsigned int stream_width, unsigned int num_bits>
template<unsigned int stream_width, unsigned int num_bits, unsigned int step>
void accl_out(
unsigned int destination,
ap_uint<32> comm_adr,
@@ -67,9 +23,9 @@ void accl_out(
ap_uint<accl_width> accl_word;
ap_uint<stream_width> stream_word;
#ifdef CPPSIM
std::cerr << "accl_out starting to output data to rank " << destination << " (" << num_bits << " bits)" << std::endl;
int step = std::gcd(accl_width, stream_width);
#endif
for (int i = 0; i < num_bits - step + 1; i += step) {
if (i % stream_width == 0) {
@@ -87,19 +43,20 @@ void accl_out(
}
bool leftover = num_bits % accl_width != 0;
int num_transferred_bits = num_bits + leftover ? accl_width : 0;
int num_transfer_bits = ((num_bits + accl_width - 1) / accl_width) * accl_width;
if (num_bits < num_transferred_bits) {
if (num_bits < num_transfer_bits) {
data.push(accl_word, 0);
}
#ifdef CPPSIM
std::cerr << "accl_out calling accl" << std::endl;
accl.stream_put(num_transferred_bits / 32, 9, destination, (ap_uint<64>)&accl_word);
#endif
std::cerr << "accl_out finished" << std::endl;
accl.stream_put(num_transfer_bits / 32, 9, destination, 0);
}
template<unsigned int stream_width, unsigned int num_bits>
template<unsigned int stream_width, unsigned int num_bits, unsigned int step>
void accl_in(
unsigned int source,
STREAM<stream_word> &data_from_cclo,
@@ -108,16 +65,19 @@ void accl_in(
#pragma HLS INTERFACE axis port=data_from_cclo
#pragma HLS INTERFACE axis port=out
STREAM<stream_word> data_to_cclo;
accl_hls::ACCLData data(data_to_cclo, data_from_cclo);
ap_uint<accl_width> accl_word;
ap_uint<stream_width> stream_word;
#ifdef CPPSIM
std::cerr << "accl_in starting to receive data from rank " << source << " (" << num_bits << " bits)" << std::endl;
int step = std::gcd(accl_width, stream_width);
#endif
for (int i = 0; i < num_bits - step + 1; i += step) {
if (i % accl_width == 0) {
accl_word = data_from_cclo.read().data;
accl_word = data.pull().data;
}
int ni = i + step - 1;
@@ -126,10 +86,11 @@ void accl_in(
accl_word(ni % accl_width, i % accl_width);
if ((ni + 1) % stream_width == 0) {
std::cerr << "accl_in writing to stream" << std::endl;
out.write(stream_word);
}
}
#ifdef CPPSIM
std::cerr << "accl_in finished" << std::endl;
#endif
}
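The `accl_out`/`accl_in` loops above repack data between `stream_width`-bit and `accl_width`-bit words by copying `step = gcd(accl_width, stream_width)` bits per iteration, the widest chunk size that never straddles a word boundary on either side. A minimal Python sketch of the same repacking invariant (illustrative only, not part of this commit):
``` python
import math

def repack_words(words, src_width, dst_width):
    """Repack LSB-first src_width words into dst_width words."""
    step = math.gcd(src_width, dst_width)
    # step-bit chunks never cross a word boundary on either side, which is
    # what lets the HLS loops above use simple ap_uint bit-slicing.
    assert src_width % step == 0 and dst_width % step == 0
    bits = "".join(format(w, "0%db" % src_width)[::-1] for w in words)
    return [int(bits[i:i + dst_width][::-1], 2)
            for i in range(0, len(bits), dst_width)]

print([hex(w) for w in repack_words([0xABCD], 16, 8)])  # ['0xcd', '0xab']
```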
#include <iostream>
#include <memory>
#include <accl.hpp>
#include <accl_network_utils.hpp>
#include "cclo_bfm.h"
std::unique_ptr<ACCL::ACCL> init_accl(
    unsigned int world_size,
    unsigned int rank,
    unsigned int start_port
) {
    accl_network_utils::acclDesign design = accl_network_utils::acclDesign::AXIS3x;
    std::vector<ACCL::rank_t> ranks;
    // TODO: Get the rxbuf size as a config parameter
    ranks = accl_network_utils::generate_ranks(true, rank, world_size, start_port, 16 * 1024);
    return accl_network_utils::initialize_accl(ranks, rank, true, design);
}

std::unique_ptr<CCLO_BFM> init_cclo_and_wait_for_input(
    unsigned int zmqport,
    unsigned int rank,
    unsigned int world_size,
    hlslib::Stream<command_word> &cmd_to_cclo,
    hlslib::Stream<command_word> &sts_from_cclo,
    hlslib::Stream<stream_word> &data_from_cclo,
    hlslib::Stream<stream_word> &data_to_cclo
) {
    std::vector<unsigned int> dest{9};
    auto cclo = std::make_unique<CCLO_BFM>(zmqport, rank, world_size, dest,
        cmd_to_cclo, sts_from_cclo, data_from_cclo, data_to_cclo);
    cclo->run();
    // Makeshift barrier
    std::cout << "CCLO BFM started" << std::endl;
    std::string inp;
    std::cin >> inp;
    return cclo;
}
%% Cell type:markdown id: tags:
# Building the Streaming Dataflow Accelerator
<font color="red">**Live FINN tutorial:** We recommend clicking **Cell -> Run All** when you start reading this notebook for "latency hiding".</font>
**Important: This notebook depends on the 1-train-mlp-with-brevitas notebook because we are using models that were created by that notebook. So please make sure the needed .onnx files are generated prior to running this notebook.**
<img align="left" src="finn-example.png" alt="drawing" style="margin-right: 20px" width="250"/>
In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from our quantized MLP for the cybersecurity task. The key idea in such architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, as illustrated in the figure on the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vivado HLS description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.
These implementations offer a good balance of performance and flexibility, but building them by hand is difficult and time-consuming. This is where the FINN compiler comes in: it can build streaming dataflow accelerators from an ONNX description to match the desired throughput.
%% Cell type:markdown id: tags:
## Outline
-------------
1. [Introduction to `build_dataflow` Tool](#intro_build_dataflow)
2. [Understanding the Build Configuration: `DataflowBuildConfig`](#underst_build_conf)
    2.1. [Output Products](#output_prod)
    2.2. [Configuring the Board and FPGA Part](#config_fpga)
    2.3. [Configuring the Performance](#config_perf)
3. [Launch a Build: Only Estimate Reports](#build_estimate_report)
4. [Launch a Build: Stitched IP, out-of-context synth and rtlsim Performance](#build_ip_synth_rtlsim)
5. [(Optional) Launch a Build: PYNQ Bitfile and Driver](#build_bitfile_driver)
6. [(Optional) Run on PYNQ board](#run_on_pynq)
%% Cell type:markdown id: tags:
## Introduction to `build_dataflow` Tool <a id="intro_build_dataflow"></a>
Since version 0.5b, the FINN compiler has a `build_dataflow` tool. Compared to previous versions, which required setting up all the needed transformations in a Python script, it makes experimenting with dataflow architecture generation easier. The core idea is to specify the relevant build info as a configuration `dict`, from which the tool invokes all the necessary steps to make the dataflow build happen. It can be invoked either from the [command line](https://finn-dev.readthedocs.io/en/latest/command_line.html) or with a single Python function call.
In this notebook, we'll use the Python function call to invoke the builds to stay inside the Jupyter notebook, but feel free to experiment with reproducing what we do here with the `./run-docker.sh build_dataflow` and `./run-docker.sh build_custom` command-line entry points too.
%% Cell type:markdown id: tags:
## Understanding the Build Configuration: `DataflowBuildConfig` <a id="underst_build_conf"></a>
The build configuration is specified by an instance of `finn.builder.build_dataflow_config.DataflowBuildConfig`. The configuration is a Python [`dataclass`](https://docs.python.org/3/library/dataclasses.html) which can be serialized into or de-serialized from JSON files for persistence, although we'll just set it up in Python here.
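%% Cell type:code id: tags:
``` python
# A minimal sketch of JSON persistence (assumption: DataflowBuildConfig uses
# the dataclasses-json mixin, providing to_json()/from_json()):
import finn.builder.build_dataflow_config as build_cfg
cfg = build_cfg.DataflowBuildConfig(
    output_dir="some_output_dir",
    synth_clk_period_ns=10.0,
    generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
)
json_str = cfg.to_json()  # serialize for persistence
cfg_restored = build_cfg.DataflowBuildConfig.from_json(json_str)  # deserialize
```
%% Cell type:markdown id: tags: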
There are many options in the configuration to customize different aspects of the build; we'll only cover a few of them in this notebook. You can read the details on all the config options in [the FINN API documentation](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowBuildConfig).
Let's go over some of the members of the `DataflowBuildConfig`:
### Output Products <a id="output_prod"></a>
The build can produce many different outputs, and some of them can take a long time (e.g. bitfile synthesis for a large network). When you first start working on generating a new accelerator and exploring the different performance options, you may not want to go all the way to a bitfile. Thus, in the beginning you may just select the estimate reports as the output products. Gradually, you can generate the output products from later stages until you are happy enough with the design to build the full accelerator integrated into a shell.
The output products are controlled by:
* `generate_outputs`: list of output products (of type [`finn.builder.build_dataflow_config.DataflowOutputType`](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowOutputType)) that will be generated by the build. Some available options are:
- `ESTIMATE_REPORTS` : report expected resources and performance per layer and for the whole network without any synthesis
- `STITCHED_IP` : create a stream-in stream-out IP design that can be integrated into other Vivado IPI or RTL designs
- `RTLSIM_PERFORMANCE` : use PyVerilator to do a performance/latency test of the `STITCHED_IP` design
- `OOC_SYNTH` : run out-of-context synthesis (just the accelerator itself, without any system surrounding it) on the `STITCHED_IP` design to get post-synthesis FPGA resources and achievable clock frequency
- `BITFILE` : integrate the accelerator into a shell to produce a standalone bitfile
- `PYNQ_DRIVER` : generate a PYNQ Python driver that can be used to launch the accelerator
- `DEPLOYMENT_PACKAGE` : create a folder with the `BITFILE` and `PYNQ_DRIVER` outputs, ready to be copied to the target FPGA platform.
* `output_dir`: the directory where all the generated build outputs above will be written into.
* `steps`: list of predefined (or custom) build steps FINN will go through. Use `build_dataflow_config.estimate_only_dataflow_steps` to execute only the steps needed for estimation (without any synthesis), and the `build_dataflow_config.default_build_dataflow_steps` otherwise (which is the default value). You can find the list of default steps [here](https://finn.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.default_build_dataflow_steps) in the documentation.
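A quick way to inspect these step lists (both are plain Python lists of step names or custom step functions):
%% Cell type:code id: tags:
``` python
import finn.builder.build_dataflow_config as build_cfg
# Print the reduced step list used for estimate-only builds:
print(build_cfg.estimate_only_dataflow_steps)
```
%% Cell type:markdown id: tags: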
### Configuring the Board and FPGA Part <a id="config_fpga"></a>
* `fpga_part`: Xilinx FPGA part to be used for synthesis, can be left unspecified to be inferred from `board` below, or specified explicitly for e.g. out-of-context synthesis.
* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell. See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/basic.py#L41) for a list of possible boards.
* `shell_flow_type`: the target [shell flow type](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.ShellFlowType), only needed for generating full bitfiles where the FINN design is integrated into a shell (so only needed if `BITFILE` is selected)
### Configuring the Performance <a id="config_perf"></a>
You can configure the performance (and correspondingly, the FPGA resource footprint) of the generated dataflow accelerator in two ways:
1) (basic) Set a target performance and let the compiler figure out the per-node parallelization settings.
2) (advanced) Specify a separate .json as `folding_config_file` that lists the degree of parallelization (as well as other hardware options) for each layer.
This notebook only deals with the basic approach, for which you need to set up:
* `target_fps`: target inference performance in frames per second. Note that target may not be achievable due to specific layer constraints, or due to resource limitations of the FPGA.
* `synth_clk_period_ns`: target clock period (in nanoseconds) for Vivado synthesis, e.g. `synth_clk_period_ns=5.0` will target a 200 MHz clock. Note that the target clock period may not be achievable depending on the FPGA part and design complexity.
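For example, converting the target clock period to a frequency:
%% Cell type:code id: tags:
``` python
synth_clk_period_ns = 5.0  # example value from above
print("target clock: %.0f MHz" % (1000.0 / synth_clk_period_ns))  # prints 200 MHz
```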
%% Cell type:markdown id: tags:
## Launch a Build: Only Estimate Reports <a id="build_estimate_report"></a>
First, we'll launch a build that only generates the estimate reports, which does not require any synthesis. Note two things below: how the `generate_outputs` only contains `ESTIMATE_REPORTS`, but also how the `steps` uses a value of `estimate_only_dataflow_steps`. This skips steps like HLS synthesis to provide a quick estimate from analytical models.
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
```
%% Cell type:code id: tags:
``` python
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil
model_dir = os.environ['FINN_ROOT'] + "/notebooks/end2end_example/cybersecurity"
model_file = model_dir + "/cybsec-mlp-ready.onnx"
estimates_output_dir = "output_estimates_only"
# Delete previous run results if they exist
if os.path.exists(estimates_output_dir):
    shutil.rmtree(estimates_output_dir)
    print("Previous run results deleted!")
cfg_estimates = build.DataflowBuildConfig(
    verbose = True,
    output_dir = estimates_output_dir,
    mvau_wwidth_max = 80,
    target_fps = 1000000,
    synth_clk_period_ns = 10.0,
    fpga_part = "xc7z020clg400-1",
    steps = build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
    ],
    board = 'U250',
    num_boards = 2,
    save_intermediate_models = True,
)
```
%% Output
Previous run results deleted!
%% Cell type:code id: tags:
``` python
%%time
build.build_dataflow_cfg(model_file, cfg_estimates)
```
%% Output
Building dataflow accelerator from /home/streichg/finn/notebooks/end2end_example/cybersecurity/cybsec-mlp-ready.onnx
Intermediate outputs will be generated in /tmp/finn_dev_streichg
Final outputs will be generated in output_estimates_only
Build log is at output_estimates_only/build_dataflow.log
Running step: step_qonnx_to_finn [1/12]
Running step: step_tidy_up [2/12]
Running step: step_streamline [3/12]
/home/streichg/finn/deps/qonnx/src/qonnx/transformation/infer_data_layouts.py:124: UserWarning: Assuming 2D input is NC
warnings.warn("Assuming 2D input is NC")
Running step: step_convert_to_hls [4/12]
Running step: step_create_dataflow_partition [5/12]
Running step: step_distribute_dataflow [6/12]
Welcome to the CBC MILP Solver
Version: devel
Build Date: Nov 15 2020
Starting solution of the Linear programming relaxation problem using Primal Simplex
Coin0506I Presolve 0 (-125) rows, 0 (-176) columns and 0 (-652) elements
Clp0000I Optimal - objective value 10
Coin0511I After Postsolve, objective 10, infeasibilities - dual 0 (0), primal 0 (0)
Clp0032I Optimal objective 10 - 0 iterations time 0.002, Presolve 0.00, Idiot 0.00
Starting MIP optimization
Solution:
Floorplan: device (task version) 3 (1), 7 (0), 7 (1),
Floorplan Graph:
| 0 | 1 | 2 |<-- task_nodes
--|---|---|---|
0| | | |
1| | | |
2| | | |
3| 1 | | |
4| | | |
5| | | |
6| | | |
7| | 0 | 1 |
^
|
compute_nodes
Resource results:
|LUT |FF |BRAMs |URAM |DSPs |
0| 0.00%| 0.00%| 0.00%| 0.00%| 0.00%|
1| 0.00%| 0.00%| 0.00%| 0.00%| 0.00%|
2| 0.00%| 0.00%| 0.00%| 0.00%| 0.00%|
3| 0.10%| 0.00%| 0.13%| 0.00%| 0.00%|
4| 0.00%| 0.00%| 0.00%| 0.00%| 0.00%|
5| 0.00%| 0.00%| 0.00%| 0.00%| 0.00%|
6| 0.00%| 0.00%| 0.00%| 0.00%| 0.00%|
7| 0.21%| 0.00%| 0.26%| 0.00%| 0.03%|
Max| 0.21%| 0.00%| 0.26%| 0.00%| 0.03%|
Avg| 0.04%| 0.00%| 0.05%| 0.00%| 0.00%|
*Avg: Average considering only used devices
Additional constrains:
( BRAMs + URAM + DSPs )/ 3 < 70.00%:
0| 0.00%|
1| 0.00%|
2| 0.00%|
3| 0.04%|
4| 0.00%|
5| 0.00%|
6| 0.00%|
7| 0.10%|
Connection Matrix Stats
Resource: SLL
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
0| | 0.00%| | | | | | |
1| 0.00%| | 0.00%| | | | | |
2| | 0.00%| | 0.00%| | | | |
3| | | 0.00%| | | | | |
4| | | | | | 0.00%| | |
5| | | | | 0.00%| | 0.00%| |
6| | | | | | 0.00%| | 0.00%|
7| | | | | | | 0.00%| |
Resource: Eth Mbps
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
0| | | | | | | | |
1| | | | | | | | |
2| | | | | | | | |
3| | | | | | | | 0.01%|
4| | | | | | | | |
5| | | | | | | | |
6| | | | | | | | |
7| | | | 0.00%| | | | |
Running step: step_target_fps_parallelization [7/12]
Running step: step_apply_folding_config [8/12]
Running step: step_insert_accl [9/12]
Running step: step_verify_with_cppsim [10/12]
Running step: step_minimize_bit_width [11/12]
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7f952feb0>
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7f952fee0>
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7f952fdc0>
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7f952ff70>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f952ff10>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7fc370100>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f9548100>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f95480d0>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f952ffa0>
Running step: step_generate_estimate_reports [12/12]
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7fc372980>
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7f95485b0>
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7f9548a60>
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7f9548f10>
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7f9549330>
<finn.custom_op.fpgadataflow.accl.ACCLOut object at 0x7fc7f9549360>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f95492a0>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f9549c30>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f954a080>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f954a4d0>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f954a9b0>
<finn.custom_op.fpgadataflow.accl.ACCLIn object at 0x7fc7f954a8f0>
Completed successfully
CPU times: user 1.08 s, sys: 788 ms, total: 1.87 s
Wall time: 1.47 s
CPU times: user 1.05 s, sys: 876 ms, total: 1.93 s
Wall time: 1.53 s
/home/streichg/finn/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py:620: UserWarning: Clipping some thresholds in MatrixVectorActivation_0
warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
0
%% Cell type:code id: tags:
``` python
from finn.util.visualization import showInNetron
showInNetron(f"{estimates_output_dir}/intermediate_models/step_insert_accl.onnx")
showInNetron(f"/tmp/finn_dev_streichg/distributed_partitions_4m9vo0gq/partition_0.onnx")
```
%% Output
Serving 'output_estimates_only/intermediate_models/step_insert_accl.onnx' at http://0.0.0.0:8081
Stopping http://0.0.0.0:8081
Serving '/tmp/finn_dev_streichg/distributed_partitions_4m9vo0gq/partition_0.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f635bff5ab0>
<IPython.lib.display.IFrame at 0x7fc8f0fc3b50>
%% Cell type:code id: tags:
``` python
from finn.util.visualization import showInNetron
showInNetron(f"/tmp/finn_dev_streichg/distributed_partitions_qbgrc4tf/partition_1.onnx")
```
%% Output
Stopping http://0.0.0.0:8081
Serving '/tmp/finn_dev_streichg/distributed_partitions_qbgrc4tf/partition_1.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f5f5969bf40>
%% Cell type:code id: tags:
``` python
showInNetron("/tmp/finn_dev_streichg/distributed_partitions_y5o7qndy/partition_1.onnx")
```
%% Cell type:code id: tags:
``` python
!ls {estimates_output_dir}/intermediate_models
```
%% Output
step_apply_folding_config.onnx step_qonnx_to_finn.onnx
step_convert_to_hls.onnx step_streamline.onnx
step_create_dataflow_partition.onnx step_target_fps_parallelization.onnx
step_distribute_dataflow.onnx step_tidy_up.onnx
step_generate_estimate_reports.onnx step_verify_with_cppsim.onnx
step_insert_accl.onnx supported_op_partitions
step_minimize_bit_width.onnx
%% Cell type:code id: tags:
``` python
estimates_output_dir
```
%% Output
'output_estimates_only'
%% Cell type:code id: tags:
``` python
assert os.path.exists(estimates_output_dir + "/report/0/estimate_network_performance.json")
```
%% Output
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Cell In[5], line 1
----> 1 assert os.path.exists(estimates_output_dir + "/report/0/estimate_network_performance.json")
AssertionError:
%% Cell type:markdown id: tags:
We'll now examine the generated outputs from this build. If we look under the outputs directory, we'll find subfolders with the intermediate models and the generated estimate reports.
%% Cell type:code id: tags:
``` python
! ls {estimates_output_dir}/intermediate_models
```
%% Output
step_apply_folding_config.onnx step_qonnx_to_finn.onnx
step_convert_to_hls.onnx step_streamline.onnx
step_create_dataflow_partition.onnx step_target_fps_parallelization.onnx
step_distribute_dataflow.onnx step_tidy_up.onnx
step_generate_estimate_reports.onnx step_verify_with_cppsim.onnx
step_insert_accl.onnx supported_op_partitions
step_minimize_bit_width.onnx
%% Cell type:code id: tags:
``` python
! ls {estimates_output_dir}/report
```
%% Cell type:markdown id: tags:
We see that various reports have been generated as .json files. Let's examine the contents of the `estimate_network_performance.json` for starters. Here, we can see the analytical estimates for the performance and latency.
%% Cell type:code id: tags:
``` python
! cat {estimates_output_dir}/report/estimate_network_performance.json
```
%% Cell type:markdown id: tags:
Since all of these reports are .json files, we can easily load them into Python for further processing. This can be useful if you are building your own design automation tools on top of FINN. Let's define a helper function and look at the `estimate_layer_cycles.json` report.
%% Cell type:code id: tags:
``` python
import json
def read_json_dict(filename):
    with open(filename, "r") as f:
        ret = json.load(f)
    return ret
```
%% Cell type:code id: tags:
``` python
read_json_dict(estimates_output_dir + "/report/estimate_layer_cycles.json")
```
%% Cell type:markdown id: tags:
Here, we can see the estimated number of clock cycles each layer will take. Recall that all of these layers run in parallel, so the slowest layer determines the overall throughput of the entire neural network. FINN attempts to parallelize each layer such that they all take a similar number of cycles, each staying under the cycle budget required to meet `target_fps`. Additionally, by summing up all the layer cycle estimates, one can obtain an estimate of the overall latency of the whole network, as the quick sketch below illustrates.
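%% Cell type:code id: tags:
``` python
# Sanity-check sketch (not part of the standard flow): derive the throughput
# and latency implied by the cycle estimates, assuming the 10 ns clock
# configured above.
layer_cycles = read_json_dict(estimates_output_dir + "/report/estimate_layer_cycles.json")
clk_ns = 10.0
bottleneck = max(layer_cycles.values())  # the slowest layer limits throughput
print("est. throughput: %.0f fps" % (1e9 / (bottleneck * clk_ns)))
print("est. latency: %.1f us" % (sum(layer_cycles.values()) * clk_ns / 1e3))
```
%% Cell type:markdown id: tags: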
Finally, we can see the layer-by-layer resource estimates in the `estimate_layer_resources.json` report:
%% Cell type:code id: tags:
``` python
read_json_dict(estimates_output_dir + "/report/estimate_layer_resources.json")
```
%% Cell type:markdown id: tags:
This particular report is useful to determine whether the current configuration will fit into a particular FPGA. If you see that the resource requirements are too high for the FPGA you had in mind, you should consider lowering the `target_fps`.
**Note that the analytical models tend to over-estimate the resources needed, since they can't capture the effects of various synthesis optimizations.**
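As a rough sketch of such a fit check (assuming the report aggregates all layers under a "total" key; the LUT budget below is the xc7z020's, as an example):
%% Cell type:code id: tags:
``` python
res = read_json_dict(estimates_output_dir + "/report/estimate_layer_resources.json")
lut_budget = 53200  # approximate LUT count of the xc7z020 targeted above
lut_needed = res["total"]["LUT"]
print("LUT estimate: %d / %d (%.1f%%)" % (lut_needed, lut_budget, 100.0 * lut_needed / lut_budget))
```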
%% Cell type:markdown id: tags:
## Launch a Build: Stitched IP, out-of-context synth and rtlsim Performance <a id="build_ip_synth_rtlsim"></a>
Once we have a configuration that gives satisfactory estimates, we can move on to generating the accelerator. We can do this in different ways depending on how we want to integrate the accelerator into a larger system. For instance, if we have a larger streaming system built in Vivado or if we'd like to re-use this generated accelerator as an IP component in other projects, the `STITCHED_IP` output product is a good choice. We can also use the `OOC_SYNTH` output product to get post-synthesis resource and clock frequency numbers for our accelerator.
<font color="red">**Live FINN tutorial:** These next builds will take about 10 minutes to complete since multiple calls to Vivado and a call to RTL simulation are involved. While this is running, you can examine the generated files with noVNC -- it is running on **(your AWS URL):6080/vnc.html**
* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MatrixVectorActivation_XXXXXX`
* Once the `step_create_stitched_ip [11/16]` below is completed, you can view the generated stitched IP in Vivado under `/home/ubuntu/finn/notebooks/end2end_example/cybersecurity/output_ipstitch_ooc_rtlsim/stitched_ip`
</font>
%% Cell type:code id: tags:
``` python
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil
model_file = model_dir + "/cybsec-mlp-ready.onnx"
rtlsim_output_dir = "output_ipstitch_ooc_rtlsim"
# Delete previous run results if they exist
if os.path.exists(rtlsim_output_dir):
    shutil.rmtree(rtlsim_output_dir)
    print("Previous run results deleted!")
cfg_stitched_ip = build.DataflowBuildConfig(
    output_dir = rtlsim_output_dir,
    mvau_wwidth_max = 80,
    target_fps = 1000000,
    synth_clk_period_ns = 10.0,
    fpga_part = "xc7z020clg400-1",
    generate_outputs=[
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
    ]
)
```
%% Cell type:code id: tags:
``` python
%%time
build.build_dataflow_cfg(model_file, cfg_stitched_ip)
```
%% Cell type:code id: tags:
``` python
assert os.path.exists(rtlsim_output_dir + "/report/ooc_synth_and_timing.json")
assert os.path.exists(rtlsim_output_dir + "/report/rtlsim_performance.json")
assert os.path.exists(rtlsim_output_dir + "/final_hw_config.json")
```
%% Cell type:markdown id: tags:
Why is e.g. `step_synthesize_bitfile` listed above even though we didn't ask for a bitfile in the output products? This is because we're using the default set of build steps, which includes `step_synthesize_bitfile`. Since its output product is not selected, this step will do nothing.
%% Cell type:markdown id: tags:
Among the output products, we will find the accelerator exported as a stitched IP block design:
%% Cell type:code id: tags:
``` python
! ls {rtlsim_output_dir}/stitched_ip
```
%% Cell type:markdown id: tags:
We also have a few reports generated by these output products, different from the ones generated by `ESTIMATE_REPORTS`.
%% Cell type:code id: tags:
``` python
! ls {rtlsim_output_dir}/report
```
%% Cell type:markdown id: tags:
In `ooc_synth_and_timing.json` we can find the post-synthesis resource usage and the maximum clock frequency estimate for the accelerator. Note that the clock frequency estimate here tends to be optimistic, since out-of-context synthesis is less constrained.
%% Cell type:code id: tags:
``` python
! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json
```
%% Cell type:markdown id: tags:
In `rtlsim_performance.json` we can find the steady-state throughput and latency for the accelerator, as obtained by rtlsim. If the DRAM bandwidth numbers reported here are below what the hardware platform is capable of (i.e. the accelerator is not memory-bound), you can expect the same steady-state throughput (excluding any software/driver overheads) in real hardware.
%% Cell type:code id: tags:
``` python
! cat {rtlsim_output_dir}/report/rtlsim_performance.json
```
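%% Cell type:markdown id: tags:
As a sketch of that memory-bound check (the bandwidth budget below is a hypothetical figure, and we assume the report uses the same `DRAM_*` key names as the driver metrics shown later):
%% Cell type:code id: tags:
``` python
perf = read_json_dict(rtlsim_output_dir + "/report/rtlsim_performance.json")
platform_dram_mbps = 19200  # hypothetical usable DRAM bandwidth of the platform
used_mbps = perf["DRAM_in_bandwidth[Mb/s]"] + perf["DRAM_out_bandwidth[Mb/s]"]
print("memory-bound" if used_mbps > platform_dram_mbps else "not memory-bound")
```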
%% Cell type:markdown id: tags:
Finally, let's have a look at `final_hw_config.json`. This is the node-by-node hardware configuration determined by the FINN compiler, including FIFO depths, parallelization settings (PE/SIMD) and others. If you want to optimize your build further (the "advanced" method we mentioned under "Configuring the Performance"), you can pass this .json file as the `folding_config_file` for a new run, using it as a starting point for further exploration and optimization; a sketch follows below.
%% Cell type:code id: tags:
``` python
! cat {rtlsim_output_dir}/final_hw_config.json
```
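%% Cell type:markdown id: tags:
A sketch of that reuse (hypothetical output directory; other options as configured earlier):
%% Cell type:code id: tags:
``` python
cfg_advanced = build.DataflowBuildConfig(
    output_dir = "output_advanced",
    folding_config_file = rtlsim_output_dir + "/final_hw_config.json",
    synth_clk_period_ns = 10.0,
    fpga_part = "xc7z020clg400-1",
    generate_outputs = [build_cfg.DataflowOutputType.STITCHED_IP],
)
```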
%% Cell type:markdown id: tags:
## (Optional) Launch a Build: PYNQ Bitfile and Driver <a id="build_bitfile_driver"></a>
<font color="red">**Live FINN tutorial:** This section is not included in the hands-on tutorial due to the bitfile synthesis time (15-20 min). If you own a PYNQ board, we encourage you to uncomment the cells below to try it out on your own after the tutorial.</font>
%% Cell type:code id: tags:
``` python
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil
model_file = model_dir + "/cybsec-mlp-ready.onnx"
final_output_dir = "output_final"
# Delete previous run results if they exist
if os.path.exists(final_output_dir):
    shutil.rmtree(final_output_dir)
    print("Previous run results deleted!")
cfg = build.DataflowBuildConfig(
    output_dir = final_output_dir,
    mvau_wwidth_max = 80,
    target_fps = 1000000,
    synth_clk_period_ns = 10.0,
    board = "Pynq-Z1",
    shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,
    generate_outputs=[
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ]
)
```
%% Cell type:code id: tags:
``` python
#%%time
#build.build_dataflow_cfg(model_file, cfg)
```
%% Cell type:markdown id: tags:
For our final build, the output products include the bitfile (and the accompanying .hwh file, also needed to execute correctly on PYNQ for Zynq platforms):
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/bitfile
```
%% Cell type:markdown id: tags:
The generated Python driver lets us execute the accelerator on PYNQ platforms with simple numpy I/O. You can find some notebooks showing how to use FINN-generated accelerators at runtime in the [finn-examples](https://github.com/Xilinx/finn-examples) repository.
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/driver
```
%% Cell type:markdown id: tags:
The reports folder contains the post-synthesis resource and timing reports:
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/report
```
%% Cell type:markdown id: tags:
Finally, we have the `deploy` folder which contains everything you need to copy onto the target board to get the accelerator running:
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/deploy
```
%% Cell type:markdown id: tags:
## (Optional) Run on PYNQ board <a id="run_on_pynq"></a>
<font color="red">**Live FINN tutorial:** This section is not included in the hands-on tutorial due to the bitfile synthesis time (15-20 min) of the previous section. If you own a PYNQ board, we encourage you to uncomment the cells below to try it out on your own after the tutorial.</font>
To test the accelerator on the board, we'll put a copy of the dataset and a premade Python script that validates the accuracy into the `driver` folder, then make a zip archive of the whole deployment folder.
%% Cell type:code id: tags:
``` python
#! cp unsw_nb15_binarized.npz {final_output_dir}/deploy/driver
```
%% Cell type:code id: tags:
``` python
#! cp validate-unsw-nb15.py {final_output_dir}/deploy/driver
```
%% Cell type:code id: tags:
``` python
#! ls {final_output_dir}/deploy/driver
```
%% Cell type:code id: tags:
``` python
#from shutil import make_archive
#make_archive('deploy-on-pynq', 'zip', final_output_dir+"/deploy")
```
%% Cell type:markdown id: tags:
You can now download the created zipfile (**File -> Open**, mark the checkbox next to the `deploy-on-pynq.zip` and select Download from the toolbar), then copy it to your PYNQ board (for instance via `scp` or `rsync`). Then, run the following commands **on the PYNQ board** to extract the archive and run the validation:
%% Cell type:markdown id: tags:
```shell
unzip deploy-on-pynq.zip -d finn-cybsec-mlp-demo
cd finn-cybsec-mlp-demo/driver
sudo python3.6 -m pip install bitstring
sudo python3.6 validate-unsw-nb15.py --batchsize 1000
```
%% Cell type:markdown id: tags:
You should see `Final accuracy: 91.868293` at the end. You may have noticed that the validation doesn't *quite* run at 1M inferences per second. This is because of the Python packing/unpacking and data movement overheads. To see this in more detail, the generated driver includes a benchmarking mode that shows the runtime breakdown:
%% Cell type:markdown id: tags:
```shell
sudo python3.6 driver.py --exec_mode throughput_test --bitfile ../bitfile/finn-accel.bit --batchsize 1000
cat nw_metrics.txt
```
%% Cell type:markdown id: tags:
```
{'runtime[ms]': 1.0602474212646484,
 'throughput[images/s]': 943176.0737575893,
 'DRAM_in_bandwidth[Mb/s]': 70.7382055318192,
 'DRAM_out_bandwidth[Mb/s]': 0.9431760737575894,
 'fclk[mhz]': 100.0,
 'batch_size': 1000,
 'fold_input[ms]': 9.679794311523438e-05,
 'pack_input[ms]': 0.060115814208984375,
 'copy_input_data_to_device[ms]': 0.002428770065307617,
 'copy_output_data_from_device[ms]': 0.0005249977111816406,
 'unpack_output[ms]': 0.3773000240325928,
 'unfold_output[ms]': 6.818771362304688e-05}
```
%% Cell type:markdown id: tags:
Here, the various `pack_input/unpack_output` calls show the overhead of packing/unpacking the inputs/outputs to convert from numpy arrays to the bit-contiguous data representation our accelerator expects. The `copy_input_data_to_device` and `copy_output_data_from_device` indicate the cost of moving the data between the CPU and accelerator memories. These overheads can dominate the execution time when running with small batch sizes.
Finally, we can see that `throughput[images/s]`, which is the pure hardware throughput without any software and data movement overheads, is close to 1M inferences per second.
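As a quick back-of-the-envelope, summing the overhead entries from the metrics above (values copied verbatim from the printout):
%% Cell type:code id: tags:
``` python
runtime_ms = 1.0602474212646484
overhead_ms = (9.679794311523438e-05 + 0.060115814208984375
               + 0.002428770065307617 + 0.0005249977111816406
               + 0.3773000240325928 + 6.818771362304688e-05)
print("software overhead: %.3f ms vs %.3f ms of kernel runtime" % (overhead_ms, runtime_ms))
```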
......
@@ -11,7 +11,7 @@ inp = unsw_nb15_data["train"][:, :-1]
inp = np.concatenate([inp, np.zeros((inp.shape[0], 7))], -1).astype(np.float32)
out = unsw_nb15_data["train"][:, -1].astype(np.float32)
indices = np.where(out == 0)[:10]
indices = np.where(out == 0)[0][:1]
inp = 2 * inp[indices] - 1
out = 2 * out[indices] - 1
@@ -19,6 +19,7 @@ out = 2 * out[indices] - 1
np.save(open("input.npy", "wb"), inp)
np.save(open("expected_output.npy", "wb"), out)
model_dir = os.environ['FINN_ROOT'] + "/notebooks/end2end_example/cybersecurity"
model_file = model_dir + "/cybsec-mlp-ready.onnx"
@@ -29,20 +30,42 @@ if os.path.exists(estimates_output_dir):
shutil.rmtree(estimates_output_dir)
print("Previous run results deleted!")
os.environ["RTLSIM_TRACE_DEPTH"] = "3"
steps = [
"step_qonnx_to_finn",
"step_tidy_up",
"step_streamline",
"step_convert_to_hls",
"step_create_dataflow_partition",
"step_distribute_dataflow",
"step_target_fps_parallelization",
"step_apply_folding_config",
"step_minimize_bit_width",
"step_insert_accl",
"step_verify_with_cppsim",
"step_generate_estimate_reports",
"step_hls_codegen",
"step_hls_ipgen",
"step_set_fifo_depths",
"step_create_stitched_ip",
]
cfg_estimates = build.DataflowBuildConfig(
verbose = True,
output_dir = estimates_output_dir,
steps = steps,
mvau_wwidth_max = 80,
target_fps = 1000000,
synth_clk_period_ns = 10.0,
fpga_part = "xc7z020clg400-1",
steps = build_cfg.estimate_only_dataflow_steps,
generate_outputs=[
generate_outputs = [
build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
build_cfg.DataflowOutputType.STITCHED_IP,
],
verify_steps=[build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM],
board = 'U250',
num_boards = 2,
verify_steps = [build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM],
board = 'U250',
num_boards = 2,
save_intermediate_models = True,
)
......
@@ -162,6 +162,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig):
sys.stderr = stderr_orig
time_per_step[step_name] = step_end - step_start
chkpt_name = "%s.onnx" % (step_name)
# TODO: Make this work in the distributed setting
if cfg.save_intermediate_models:
intermediate_model_dir = cfg.output_dir + "/intermediate_models"
if not os.path.exists(intermediate_model_dir):
......
@@ -117,8 +117,10 @@ default_build_dataflow_steps = [
"step_streamline",
"step_convert_to_hls",
"step_create_dataflow_partition",
"step_distribute_dataflow",
"step_target_fps_parallelization",
"step_apply_folding_config",
"step_insert_accl",
"step_minimize_bit_width",
"step_generate_estimate_reports",
"step_hls_codegen",
......
@@ -251,7 +251,6 @@ def verify_step(step_type):
return decorator
def prepare_for_stitched_ip_rtlsim(verify_model, cfg):
if not cfg.rtlsim_use_vivado_comps:
need_restitch = False
@@ -371,7 +370,6 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig):
return model
def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig):
"""Convert eligible nodes to `HLSCustomOp` subclasses that represent HLS
layers. Which nodes and particular configurations can be converted to HLS
@@ -505,10 +503,13 @@ def step_insert_accl(model: ModelWrapper, cfg: DataflowBuildConfig):
sdp_model_file = sdp_inst.get_nodeattr("model")
sdp_model = ModelWrapper(sdp_model_file)
sdp_model = sdp_model.transform(InsertACCL(world_size, rank, recv_from, send_to))
sdp_model = sdp_model.transform(GiveUniqueNodeNames())
sdp_model.save(sdp_model_file)
elif len(d_nodes) > 1:
assert len(d_nodes) == 1, "There should only be one DistributedDataflow node"
model = model.transform(GiveUniqueNodeNames())
return model
@@ -571,6 +572,7 @@ def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
return model
@map_over_sdps
def step_hls_codegen(model: ModelWrapper, cfg: DataflowBuildConfig):
"Generate Vivado HLS code to prepare HLSCustomOp nodes for IP generation."
@@ -578,6 +580,7 @@ def step_hls_codegen(model: ModelWrapper, cfg: DataflowBuildConfig):
return model
@map_over_sdps
def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
"""Run Vivado HLS synthesis on generated code for HLSCustomOp nodes,
in order to generate IP blocks."""
@@ -592,6 +595,7 @@ def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
return model
@map_over_sdps
def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
"""
Depending on the auto_fifo_depths setting, do one of the following:
@@ -690,6 +694,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
return model
@map_over_sdps
def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig):
"""Create stitched IP for a graph after all HLS IP blocks have been generated.
Depends on the DataflowOutputType.STITCHED_IP output product."""
......
@@ -48,6 +48,9 @@ accl_word_size = 512
class ACCLOp(HLSCustomOp):
barriers = defaultdict(lambda: threading.Barrier(2))
def __init__(self, onnx_node, **kwargs):
super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
my_attrs = {
"NumChannels": ("i", True, 0),
@@ -55,7 +58,6 @@
"dataType": ("s", True, ""),
# shape describing input vecs per execution
"numInputVectors": ("ints", False, [1]),
"usedBits": ("i", True, 0),
# accl specific attrs
"startPort": ("i", False, 5500),
"rank": ("i", True, 0),
@@ -152,7 +154,43 @@ compilation transformations?
"""Returns FINN DataType of output. (Same as input datatype)"""
return self.get_input_datatype()
def get_folded_input_shape(self):
def global_includes(self):
self.code_gen_dict["$GLOBALS$"] = []
def defines(self, mode):
# Do the includes here as well as they have dependencies on the defines
self.code_gen_dict["$DEFINES$"] = []
if mode == 'cppsim':
self.code_gen_dict["$DEFINES$"] += [
"#define CPPSIM",
'#include "cclo_bfm.h"',
'#include "accl/sim.hpp"',
]
elif mode == 'ipgen':
self.code_gen_dict["$DEFINES$"] += [
'#define ACCL_SYNTHESIS',
]
self.code_gen_dict["$DEFINES$"] += [
'#include <accl_hls.h>',
'#include "accl/funcs.hpp"',
]
def get_stream_width(self):
tbits = self.get_input_datatype().bitwidth()
return tbits * self.get_nodeattr("NumChannels")
def verify_node(self):
...
class ACCLOut(ACCLOp):
def get_instream_width(self, ind=0):
return self.get_stream_width()
def get_outstream_width(self, ind=0):
return accl_word_size
def get_folded_input_shape(self, ind=0):
ich = self.get_nodeattr("NumChannels")
vecs = list(self.get_nodeattr("numInputVectors"))
@@ -161,71 +199,61 @@ compilation transformations?
return (*vecs, fold, ich)
def get_folded_output_shape(self):
return self.get_folded_input_shape()
def get_folded_output_shape(self, ind=0):
ich = self.get_nodeattr("NumChannels")
vecs = list(self.get_nodeattr("numInputVectors"))
num_bits = np.prod(vecs) * ich * self.get_input_datatype().bitwidth()
fold = int(math.ceil(num_bits / accl_word_size))
def global_includes(self):
self.code_gen_dict["$GLOBALS$"] = [
'#include <accl_hls.h>',
'#include "cclo_bfm.h"',
'#include "accl/funcs.hpp"',
]
return (fold, 1)
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = [
'#pragma HLS INTERFACE axis port=cmd_to_cclo',
'#pragma HLS INTERFACE axis port=sts_from_cclo',
'#pragma HLS INTERFACE axis port=data_to_cclo',
'#pragma HLS INTERFACE axis port=data_from_cclo',
'#pragma HLS INTERFACE axis port=stream',
'#pragma HLS INTERFACE axis port=in0_{}'.format(self.hls_sname()),
]
def get_stream_width(self):
tbits = self.get_input_datatype().bitwidth()
return tbits * self.get_nodeattr("NumChannels")
self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
def strm_decl(self):
start_port = self.get_nodeattr("startPort")
rank = self.get_nodeattr("rank")
world_size = self.get_nodeattr("worldSize")
dest = self.get_nodeattr("worldSize")
self.code_gen_dict["$STREAMDECLARATIONS$"] = [
'hlslib::Stream<command_word> cmd_to_cclo("cmd_to_cclo"), sts_from_cclo("sts_from_cclo");',
'hlslib::Stream<stream_word, 512> data_from_cclo("data_from_cclo"), data_to_cclo("data_to_cclo");',
'hls::stream<ap_uint<{}>> stream;'.format(self.get_stream_width()),
'std::vector<unsigned int> dest{9};',
'hls::stream<ap_uint<{}>> in0_{};'.format(self.get_stream_width(), self.hls_sname()),
'std::unique_ptr<ACCL::ACCL> accl = init_accl({}, {}, {});'.format(world_size, rank, start_port),
'std::unique_ptr<CCLO_BFM> cclo = init_cclo_and_wait_for_input({}, {}, {}, dest, cmd_to_cclo, sts_from_cclo, data_from_cclo, data_to_cclo);'.format(start_port, rank, world_size, dest),
'std::unique_ptr<CCLO_BFM> cclo = init_cclo_and_wait_for_input({}, {}, {}, cmd_to_cclo, sts_from_cclo, data_from_cclo, data_to_cclo);'.format(start_port, rank, world_size),
'ap_uint<32> comm_adr = accl->get_communicator_addr();',
'ap_uint<32> dpcfg_adr = accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32});',
]
def defines(self, mode):
self.code_gen_dict["$DEFINES$"] = ['']
def verify_node(self):
...
class ACCLOut(ACCLOp):
def get_instream_width(self, ind=0):
return self.get_stream_width()
def get_outstream_width(self, ind=0):
return accl_width
def docompute(self):
stream_width = self.get_instream_width()
itype_bits = self.get_input_datatype().bitwidth()
shape = self.get_folded_output_shape()
shape = self.get_folded_input_shape()
num_bits = np.prod(shape) * itype_bits
step = math.gcd(stream_width, accl_word_size)
dest = self.get_nodeattr("otherRank")
self.code_gen_dict["$DOCOMPUTE$"] = [
'accl_out<{}, {}>({}, comm_adr, dpcfg_adr, cmd_to_cclo, sts_from_cclo, data_to_cclo, stream);'.format(stream_width, num_bits, dest),
'cclo->stop();',
'''accl_out<{}, {}, {}>(
{},
comm_adr,
dpcfg_adr,
cmd_to_cclo,
sts_from_cclo,
data_to_cclo,
in0_{}
);'''.format(stream_width, num_bits, step, dest, self.hls_sname()),
]
def execute_node(self, context, graph):
@@ -246,7 +274,7 @@ class ACCLOut(ACCLOp):
str(context[node.input[0]].dtype) == "float32"
), """Input datatype is
not float32 as expected."""
expected_inp_shape = self.get_folded_output_shape()
expected_inp_shape = self.get_folded_input_shape()
reshaped_input = context[node.input[0]].reshape(expected_inp_shape)
if self.get_input_datatype() == DataType["BIPOLAR"]:
@@ -277,8 +305,8 @@ class ACCLOut(ACCLOp):
# note: the innermost dim is reversed for the input
self.code_gen_dict["$READNPYDATA$"].append(
'npy2apintstream<%s, %s, %d, %s>("%s", stream, false);'
% (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
% (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in, self.hls_sname())
)
def save_as_npy(self):
@@ -288,15 +316,87 @@ class ACCLOut(ACCLOp):
self.code_gen_dict["$DATAOUTSTREAM$"] = ['']
def blackboxfunction(self):
pass
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
'''void {}(
STREAM<command_word> &cmd_to_cclo,
STREAM<command_word> &sts_from_cclo,
STREAM<stream_word> &data_to_cclo,
ap_uint<32> comm_adr,
ap_uint<32> dpcfg_adr,
hls::stream<ap_uint<{}>> &in0_{}
)'''
.format(
self.onnx_node.name,
self.get_instream_width(),
self.hls_sname()
)
]
def get_verilog_top_module_intf_names(self):
intf_names = super().get_verilog_top_module_intf_names()
intf_names["m_axis"] = [("data_to_cclo", accl_word_size), ("cmd_to_cclo", 32)]
intf_names["s_axis"].append(("sts_from_cclo", 32))
return intf_names
def code_generation_ipi(self):
cmd = super().code_generation_ipi()
cmd += [
'make_bd_intf_pins_external [get_bd_intf_pins {}/{}]'.format(
self.onnx_node.name,
pin_name
)
for pin_name in ["cmd_to_cclo", "sts_from_cclo"]
]
return cmd
class ACCLIn(ACCLOp):
def get_outstream_width(self, ind=0):
return accl_width
def get_instream_width(self, ind=0):
return accl_word_size
def get_outstream_width(self, ind=0):
return self.get_stream_width()
def get_folded_input_shape(self, ind=0):
ich = self.get_nodeattr("NumChannels")
vecs = list(self.get_nodeattr("numInputVectors"))
num_bits = np.prod(vecs) * ich * self.get_input_datatype().bitwidth()
fold = int(math.ceil(num_bits / accl_word_size))
return (fold, 1)
def get_folded_output_shape(self, ind=0):
ich = self.get_nodeattr("NumChannels")
vecs = list(self.get_nodeattr("numInputVectors"))
ich_bits = ich * self.get_input_datatype().bitwidth()
fold = int(math.ceil(ich_bits / accl_word_size))
return (*vecs, fold, ich)
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = [
'#pragma HLS INTERFACE axis port=data_from_cclo',
'#pragma HLS INTERFACE axis port=out_{}'.format(self.hls_sname()),
]
self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
def strm_decl(self):
start_port = self.get_nodeattr("startPort")
rank = self.get_nodeattr("rank")
world_size = self.get_nodeattr("worldSize")
self.code_gen_dict["$STREAMDECLARATIONS$"] = [
'hlslib::Stream<command_word> cmd_to_cclo("cmd_to_cclo"), sts_from_cclo("sts_from_cclo");',
'hlslib::Stream<stream_word, 512> data_from_cclo("data_from_cclo"), data_to_cclo("data_to_cclo");',
'hls::stream<ap_uint<{}>> out_{};'.format(self.get_stream_width(), self.hls_sname()),
'std::unique_ptr<CCLO_BFM> cclo = init_cclo_and_wait_for_input({}, {}, {}, cmd_to_cclo, sts_from_cclo, data_from_cclo, data_to_cclo);'.format(start_port, rank, world_size),
]
def docompute(self):
stream_width = self.get_stream_width()
@@ -304,11 +404,18 @@ class ACCLIn(ACCLOp):
shape = self.get_folded_output_shape()
num_bits = np.prod(shape) * itype_bits
step = math.gcd(stream_width, accl_word_size)
source = self.get_nodeattr("otherRank")
self.code_gen_dict["$DOCOMPUTE$"] = [
'accl_in<{}, {}>({}, data_from_cclo, stream);'.format(stream_width, num_bits, source),
'cclo->stop();',
'accl_in<{}, {}, {}>({}, data_from_cclo, out_{});'.format(
stream_width,
num_bits,
step,
source,
self.hls_sname()
),
]
def execute_node(self, context, graph):
@@ -361,17 +468,31 @@ class ACCLIn(ACCLOp):
shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
self.code_gen_dict["$DATAOUTSTREAM$"] = [
'apintstream2npy<%s, %s, %d, %s>(stream, %s, "%s", false);'
'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
% (
packed_hls_type,
elem_hls_type,
elem_bits,
npy_type,
self.hls_sname(),
shape_cpp_str,
npy_out,
),
]
def blackboxfunction(self):
pass
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
'void {}(STREAM<stream_word> &data_from_cclo, hls::stream<ap_uint<{}>> &out_{})'
.format(
self.onnx_node.name,
self.get_outstream_width(),
self.hls_sname()
)
]
def get_verilog_top_module_intf_names(self):
intf_names = super().get_verilog_top_module_intf_names()
intf_names["s_axis"] = [("data_from_cclo", accl_word_size)]
return intf_names
@@ -162,16 +162,23 @@ class DistributedDataflow(CustomOp):
subprocess.run(["/usr/bin/cmake", "."],
cwd=emulator_dir, stdout=subprocess.PIPE)
emulator = subprocess.Popen(
["python3", "run.py", "-n 2", "--no-kernel-loopback", "-l 1"], cwd=emulator_dir)
ret = execute_distributed_onnx(
model, inp_ctx, return_full_exec_context)
world_size = int(model.get_metadata_prop("world_size"))
parent_proc = psutil.Process(emulator.pid)
for child in parent_proc.children(recursive=True):
child.kill()
emulator.kill()
emulator = subprocess.Popen([
"python3",
"run.py",
f"-n {world_size}",
"--no-kernel-loopback"
], cwd=emulator_dir)
try:
ret = execute_distributed_onnx(model, inp_ctx, return_full_exec_context)
finally:
parent_proc = psutil.Process(emulator.pid)
for child in parent_proc.children(recursive=True):
child.kill()
emulator.kill()
for i, node_oname in enumerate(node.output):
model_oname = model.graph.output[i].name
@@ -181,3 +188,4 @@ class DistributedDataflow(CustomOp):
for tname in ret.keys():
if tname not in [x.name for x in model.graph.output]:
context[node.name + "_" + tname] = ret[tname]
@@ -344,12 +344,14 @@ class HLSCustomOp(CustomOp):
def ipgen_singlenode_code(self):
"""Builds the bash script for IP generation using the CallHLS utility."""
node = self.onnx_node
code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
builder = CallHLS()
builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name))
builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name))
builder.build(code_gen_dir)
ipgen_path = builder.ipgen_path
assert os.path.isdir(ipgen_path), "IPGen failed: %s not found" % (ipgen_path)
self.set_nodeattr("ipgen_path", ipgen_path)
ip_path = ipgen_path + "/sol1/impl/ip"
......
@@ -90,11 +90,13 @@ set config_bnnlibdir "$::env(FINN_ROOT)/deps/finn-hlslib"
puts "finn-hlslib dir: $config_bnnlibdir"
set config_customhlsdir "$::env(FINN_ROOT)/custom_hls"
puts "custom HLS dir: $config_customhlsdir"
set config_acclhlsdir "$::env(FINN_ROOT)/ACCL/driver/hls"
puts "ACCL HLS dir: $config_acclhlsdir"
set config_toplevelfxn "$TOPFXN$"
set config_clkperiod $CLKPERIOD$
open_project $config_proj_name
add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++14 -I$config_bnnlibdir -I$config_customhlsdir"
add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++14 -I$config_bnnlibdir -I$config_customhlsdir -I$config_acclhlsdir"
set_top $config_toplevelfxn
open_solution sol1
......
@@ -48,7 +48,7 @@ from finn.util.fpgadataflow import is_fpgadataflow_node
def is_external_input(model, node, i):
# indicate whether input i of node should be made external
# True only if input is unconnected and has no initializer
# Only esception is second input of FC layers when mem_mode is external
# Only exception is second input of FC layers when mem_mode is external
node_inst = getCustomOp(node)
producer = model.find_producer(node.input[i])
if producer is None:
@@ -286,9 +286,9 @@ class CreateStitchedIP(Transformation):
ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream")
if self.signature:
ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/axi_info")
if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]:
if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA", "ACCLIn"]:
warnings.warn(
"""First node is not StreamingFIFO or IODMA.
"""First node is not StreamingFIFO, IODMA or ACCLIn.
You may experience incorrect stitched-IP rtlsim or hardware
behavior. It is strongly recommended to insert FIFOs prior to
calling CreateStitchedIP."""
......
@@ -36,7 +36,7 @@ class DistributeDataflow(Transformation):
self.target_clk_ns,
self.target_platform,
self.ndevices,
# TODO: Make sure we are using multiple devices
# TODO: Remove this after testing
abs_anchors=[(0, [3]), (1, [7])]
)
@@ -68,10 +68,12 @@ class DistributeDataflow(Transformation):
child_node.op_type = "DistributedDataflow"
new_child_node_inst = getCustomOp(child_node)
new_child_node_inst.set_nodeattr("world_size", len(p_nodes))
distr_model.set_metadata_prop("world_size", str(len(p_nodes)))
distr_model_file = self.partition_model_dir + "/distributed_dataflow.onnx"
distr_model.save(distr_model_file)
new_child_node_inst.set_nodeattr("model", distr_model_file)
return (model, False)
@@ -128,6 +128,7 @@ class InsertACCL(Transformation):
)
model.graph.node.append(dma_node)
modified = True
if modified:
model = model.transform(SortGraph())
return (model, modified)
@@ -181,8 +181,8 @@ class InsertFIFO(Transformation):
graph_in_names = [x.name for x in model.graph.input]
for graph_in_name in graph_in_names:
first_node = model.find_consumer(graph_in_name)
# insert FIFO as first node, except when first node is DMA
if first_node.op_type != "StreamingFIFO" and first_node.op_type != "IODMA":
# insert FIFO as first node, except when first node is DMA, ACCL
if first_node.op_type not in ["StreamingFIFO", "IODMA", "ACCLIn"]:
inp_ind = list(first_node.input).index(graph_in_name)
n_input = first_node.input[inp_ind]
n0 = getCustomOp(first_node)
@@ -230,11 +230,11 @@ class InsertFIFO(Transformation):
% (graph_in_name, fifo_depth)
)
# insert FIFO as last node, except when last node is DMA
# insert FIFO as last node, except when last node is DMA, ACCL
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type != "StreamingFIFO" and final_node.op_type != "IODMA":
if final_node.op_type not in ["StreamingFIFO", "IODMA", "ACCLOut"]:
assert (
final_node.op_type != "TLastMarker"
), """Insert tlast marker should be done
......
@@ -338,6 +338,7 @@ class InsertAndSetFIFODepths(Transformation):
# set sufficiently large threshold for 1 image to fully execute and exit
ncycles = int(latency + max_cycles)
# prepare pyverilator model
sim = pyverilate_stitched_ip(model)
@@ -382,7 +383,7 @@
# convnet, two inputs are typically enough to fill entire
# layer pipeline due to overlaps
n_inputs = 2
sim = verilator_fifosim(model, n_inputs)
sim = verilator_fifosim(model, n_inputs, max_iters=10000)
for ind, node in enumerate(fifo_nodes):
maxcount_name = "maxcount_%d" % ind
......