diff --git a/Dockerfile b/Dockerfile index c220e6ac6f4f4b24f2a10af778a0740137ee949f..9cbb364b936c27c09bcfab71ef3866dff23f95ee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -49,6 +49,30 @@ RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config RUN pip install sphinx RUN pip install sphinx_rtd_theme +# cloning dependency repos +# Brevitas +RUN git clone --branch feature/finn_onnx_export https://github.com/Xilinx/brevitas.git /workspace/brevitas +RUN git -C /workspace/brevitas checkout ed1a3b70a14a91853066ece630421e89660d93e9 + +# Brevitas examples +RUN git clone https://github.com/maltanar/brevitas_cnv_lfc.git /workspace/brevitas_cnv_lfc +RUN git -C /workspace/brevitas_cnv_lfc checkout a443708b382cbcfd69d19c9fc3fe94b2a2c03d71 + +# CNPY +RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy +RUN git -C /workspace/cnpy checkout 4e8810b1a8637695171ed346ce68f6984e585ef4 + +# FINN hlslib +RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib +RUN git -C /workspace/finn-hlslib checkout b5dc957a16017b8356a7010144b0a4e2f8cfd124 + +# PyVerilator +RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator +RUN git -C /workspace/pyverilator checkout 307fc5c82db748620836307a2002fdc9fe170226 + +# PYNQ-HelloWorld +RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld +RUN git -C /workspace/PYNQ-HelloWorld checkout ef4c438dff4bd346e5f6b8d4eddfd1c8a3999c03 # Note that we expect the cloned finn directory on the host to be # mounted on /workspace/finn -- see run-docker.sh for an example diff --git a/README.md b/README.md index 0a70f27b675c105d76259edcacb78251419a5205..b408b1a69d6833382763795f35002e2b3322f09d 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s ## What's New in FINN? -* **2020-02-27:** FINN v0.2b (beta) is released, which is a clean-slate reimplementation of the framework. Currently only fully-connected networks are supported for the end-to-end flow. Please see the release blog post for a summary of the key features. +* **2020-02-28:** FINN v0.2b (beta) is released, which is a clean-slate reimplementation of the framework. Currently only fully-connected networks are supported for the end-to-end flow. Please see the release blog post for a summary of the key features. ## Documentation diff --git a/docs/_posts/2020-02-28-finn-v02b-beta-is-released.md b/docs/_posts/2020-02-28-finn-v02b-beta-is-released.md new file mode 100644 index 0000000000000000000000000000000000000000..319c03e14229f4866279cb09a4b70419ce2fcdc7 --- /dev/null +++ b/docs/_posts/2020-02-28-finn-v02b-beta-is-released.md @@ -0,0 +1,33 @@ +--- +layout: post +title: "FINN v0.2b (beta) is released" +author: "Yaman Umuroglu" +--- + +We've been working on the new version of the FINN compiler for a while, and today we are excited to announce our first beta release to +give you a taste of how things are shaping up! + +Here's a quick overview of the key features: + +* <b>Train and export highly-quantized networks in PyTorch using Brevitas.</b> You can use <a href="https://github.com/Xilinx/brevitas">Brevitas</a>, + our PyTorch library for quantization-aware training to train networks with few-bit weights and activations, then export them into + FINN-ONNX to be used by the FINN compiler. + +* <b>Fully transparent end-to-end flow.</b> We support taking quantized networks (with limitations, see bottom of post) all the way down to a + customized FPGA bitstream. 
This happens across many steps ranging from streamlining to Vivado IPI stitching, and each step is fully
+ visible to the user. So if you are happy with just the threshold-activation (streamlined) QNN in ONNX, or if you want to take the
+ generated Vivado IP block and integrate it into your own IPI design, it's easy to break out of the flow at any step.
+ We also provide a variety of mechanisms to verify the design at different steps.
+
+* <b>ONNX-based intermediate representation.</b> We use ONNX with some custom nodes and annotations as our intermediate representation. As the
+ FINN compiler transforms the network across many steps to produce an FPGA bitstream, you can view and explore the transformed network
+ using the excellent <a href="https://www.lutzroeder.com/ai/netron">Netron</a> viewer from the comfort of your web browser.
+
+* <b>Tutorials and documentation.</b> We have prepared a set of <a href="https://github.com/Xilinx/finn/tree/master/notebooks">Jupyter notebooks</a>
+ to let you experiment with some of the things FINN can do, covering the basics, demonstrating the end-to-end flow on an example network,
+ and discussing some of the internals for more advanced users and developers. We also have Sphinx-generated documentation on
+ <a href="http://finn.readthedocs.io/">readthedocs</a> for more information on the FINN compiler and its API.
+
+The release (tagged 0.2b) is now available on GitHub. Currently it's a beta release and only supports fully-connected layers in linear
+(non-branching) topologies, but we're actively working on end-to-end convolution support for the next release. Further down the
+road, we hope to support more advanced topologies and provide end-to-end examples for MobileNet and ResNet-50.
diff --git a/docs/_posts/2020-03-11-rn50-released.md b/docs/_posts/2020-03-11-rn50-released.md
new file mode 100644
index 0000000000000000000000000000000000000000..baa924410cf56a07e22a6c85450205d18a4d45bb
--- /dev/null
+++ b/docs/_posts/2020-03-11-rn50-released.md
@@ -0,0 +1,75 @@
+---
+layout: post
+title: "ResNet50 for Alveo released"
+author: "Lucian Petrica"
+---
+
+As part of the FINN project, we're pleased to announce the release of the first fully quantized, all-dataflow ResNet50 inference accelerator for Xilinx Alveo boards. The source code is available on [GitHub](https://github.com/Xilinx/ResNet50-PYNQ) and we provide a Python [package](https://pypi.org/project/resnet50-pynq/) and a Jupyter Notebook to get you started and show how the accelerator is controlled using [PYNQ](http://www.pynq.io/) for Alveo.
+Built using a custom [FINN](https://xilinx.github.io/finn/about.html) streamlining flow, which is not yet public,
+this accelerator showcases the advantage of deep quantization for FPGA acceleration of DNN workloads in the datacenter.
+The key performance metrics are:
+
+FPGA Device | ImageNet Accuracy | Max FPS | Min Latency | Power @ Max FPS | Power @ Min Latency
+---------- |---------- |---------- |---------- |---------- |----------
+Alveo U250 | 65% Top-1 / 85% Top-5 | 2000 | 2 ms | 70 W | 40 W
+
+In addition to demonstrating the achievable performance of low-precision dataflow acceleration on Alveo, the ResNet50 design
+serves as a proof of concept for two key features of future FINN releases:
+modular build flows based on Vivado IP Integrator, and a pure Python interface to the accelerator.
+
+## Modular build flow
+
+FINN accelerators targeting embedded parts, such as the [BNN-PYNQ](https://github.com/Xilinx/BNN-PYNQ) accelerators, have in the past implemented the
+entire acceleration functionality in a single monolithic HLS C++ description.
+For large datacenter-class designs this approach is not feasible, as the HLS simulation and synthesis times become very large.
+
+Instead, here we identify the key computational pattern, the residual block, which we implement as an HLS C++ IP block by assembling multiple Matrix-Vector-Activation Units from the [FINN HLS Library](https://github.com/Xilinx/finn-hlslib).
+We then construct the accelerator by instantiating and connecting multiple residual blocks together in a Vivado IPI block design; the blocks are then synthesized in parallel and exported as a netlist IP.
+
+<img align="left" src="https://xilinx.github.io/finn/img/rn50-ipi.png" alt="drawing" style="margin-right: 20px" width="300"/>
+
+
+In our flow, this IP is linked by Vitis into an Alveo platform, but users are free to integrate the ResNet50 IP in their own Vivado-based flows and augment it with other HLS or RTL IP. See our build scripts and documentation for more information.
+
+## Pure Python host interface
+
+Using PYNQ for Alveo, users can interface directly with the ResNet50 accelerator in Python.
+To program the accelerator, an Overlay object is created from an XCLBin file produced by Vitis.
+
+```Python
+import numpy as np
+import pynq
+
+ol=pynq.Overlay("resnet50.xclbin")
+accelerator=ol.resnet50_1
+```
+
+Before using the accelerator, we must configure the weights of the fully-connected layer in DDR Bank 0.
+Assuming the weights are already loaded in the NumPy array `fcweights`, we allocate a buffer
+of appropriate size, copy the weights into it, and flush it to the Alveo DDR Bank 0.
+
+```Python
+fcbuf = pynq.allocate((1000,2048), dtype=np.int8, target=ol.bank0)
+fcbuf[:] = fcweights
+fcbuf.sync_to_device()
+```
+
+To perform inference we first allocate input and output buffers for one image, and copy the contents of the NumPy array `img` into the input buffer.
+We then flush the input data to the Alveo DDR Bank 0, and call the accelerator, providing as arguments
+the input and output buffers, the FC layer weights buffer, and the number of images to process, in this case just one.
+After the call finishes, we pull the output buffer data from the accelerator DDR to host memory and copy its
+contents to user memory in a NumPy array.
+
+```Python
+inbuf = pynq.allocate((224,224,3), dtype=np.int8, target=ol.bank0)
+outbuf = pynq.allocate((5,), dtype=np.uint32, target=ol.bank0)
+
+inbuf[:] = img
+inbuf.sync_to_device()
+
+accelerator.call(inbuf, outbuf, fcbuf, 1)
+
+outbuf.sync_from_device()
+results = np.copy(outbuf)
+```
+
+It's that easy! See our Jupyter Notebook demo and application examples for more details.
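+
+As a final illustration, the same call interface can also be used for a batch of images. The sketch below is ours rather than part of the released host code: the batch size `N`, the batched buffer shapes, and the arrays `fcweights` and `imgs` are illustrative assumptions, so please treat the ResNet50-PYNQ repository and notebook as the authoritative reference.
+
+```Python
+import numpy as np
+import pynq
+
+N = 100  # number of images per batch (illustrative)
+
+ol = pynq.Overlay("resnet50.xclbin")
+accelerator = ol.resnet50_1
+
+# FC-layer weights, configured as in the single-image example above
+fcbuf = pynq.allocate((1000, 2048), dtype=np.int8, target=ol.bank0)
+fcbuf[:] = fcweights           # fcweights: (1000, 2048) int8 NumPy array
+fcbuf.sync_to_device()
+
+# batched input/output buffers in Alveo DDR Bank 0
+inbuf = pynq.allocate((N, 224, 224, 3), dtype=np.int8, target=ol.bank0)
+outbuf = pynq.allocate((N, 5), dtype=np.uint32, target=ol.bank0)
+
+inbuf[:] = imgs                # imgs: (N, 224, 224, 3) int8 NumPy array
+inbuf.sync_to_device()
+
+accelerator.call(inbuf, outbuf, fcbuf, N)   # last argument: number of images
+
+outbuf.sync_from_device()
+results = np.copy(outbuf)      # one 5-element result vector per image
+```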
diff --git a/docs/_posts/2020-03-27-brevitas-quartznet-release.md b/docs/_posts/2020-03-27-brevitas-quartznet-release.md
new file mode 100644
index 0000000000000000000000000000000000000000..0940f754815c834662919404860b8a7b00d08e64
--- /dev/null
+++ b/docs/_posts/2020-03-27-brevitas-quartznet-release.md
@@ -0,0 +1,92 @@
+---
+layout: post
+title: "Quantized QuartzNet with Brevitas for efficient speech recognition"
+author: "Giuseppe Franco"
+---
+
+*Although not yet supported in FINN, we are excited to show you how Brevitas and quantized neural network training techniques can be applied to models beyond image classification.*
+
+We are pleased to announce the release of quantized pre-trained models of [QuartzNet](https://arxiv.org/abs/1910.10261) for efficient speech recognition.
+They can be found at the [following link](https://github.com/Xilinx/brevitas/tree/master/examples/speech_to_text), with a brief
+explanation of how to test them.
+The quantized version of QuartzNet has been trained using [Brevitas](https://github.com/Xilinx/brevitas), an experimental library for quantization-aware training.
+
+QuartzNet, whose structure can be seen in Fig. 1, is a convolution-based speech-to-text network with a structure similar to that of [Jasper](https://arxiv.org/abs/1904.03288).
+
+| <img src="https://xilinx.github.io/finn/img/QuartzNet.jpg" alt="QuartzNet Structure" title="QuartzNet Structure" width="450" height="500" align="center"/>|
+| :---:|
+| *Fig. 1 QuartzNet Model, [source](https://arxiv.org/abs/1910.10261)* |
+
+The starting point is the mel-spectrogram representation of the input audio file.
+Through repeated base building blocks of 1D Convolutions (1D-Conv), Batch-Normalizations (BN), and ReLU with residual connections,
+QuartzNet is able to reconstruct the underlying text.
+The main difference with respect to Jasper is the use of Depthwise and Pointwise 1D-Conv (Fig. 2a), instead of 'simple' 1D-Conv (Fig. 2b).
+Thanks to this structure, QuartzNet is able to achieve better performance in terms of Word Error Rate (WER) compared to Jasper,
+using *only* 19.9M parameters, compared to Jasper's 333M parameters.
+
+Moreover, the authors proposed a grouped-pointwise convolution strategy that greatly reduces the number of parameters,
+down to 8.7M, at the cost of a small degradation in accuracy.
+
+| <img src="https://xilinx.github.io/finn/img/quartzPic1.jpg" alt="QuartzNet block" title="QuartzNet block" width="130" height="220" align="center"/> | <img src="https://xilinx.github.io/finn/img/JasperVertical4.jpg" alt="Jasper block" title="Jasper block" width="130" height="220" align="center"/>|
+| :---:|:---:|
+| *Fig. 2a QuartzNet Block, [source](https://arxiv.org/abs/1910.10261)* | *Fig. 2b Jasper Block, [source](https://arxiv.org/abs/1904.03288)* |
+
+
+The authors of QuartzNet propose different BxR configurations. Each B<sub>i</sub> block consists of the same base building block described above,
+repeated R times.
+Different BxR configurations have been trained on several different datasets (Wall Street Journal,
+LibriSpeech + Mozilla Common Voice, LibriSpeech only).
+
+For our quantization experiments, we focus on the 15x5 variant trained on LibriSpeech with spec-augmentation and without grouped convolutions.
+More details about this configuration can be found in the paper and in a [related discussion with the authors](https://github.com/NVIDIA/NeMo/issues/230).
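+
+To make the block structure above concrete, here is a minimal, illustrative sketch of how one depthwise-separable QuartzNet block could be expressed with Brevitas quantized layers. This is not the released training code: the module name, hyperparameters, and the Brevitas arguments shown (`weight_bit_width`, `bit_width`) are assumptions for illustration and may differ between Brevitas versions.
+
+```Python
+import torch.nn as nn
+import brevitas.nn as qnn
+
+class QuantSeparableBlock(nn.Module):
+    """Illustrative quantized depthwise + pointwise 1D-Conv block (in the spirit of Fig. 2a)."""
+    def __init__(self, channels, kernel_size, bit_width=8):
+        super().__init__()
+        # depthwise 1D convolution: one filter per channel (groups=channels)
+        self.depthwise = qnn.QuantConv1d(channels, channels, kernel_size,
+                                         groups=channels, padding=kernel_size // 2,
+                                         weight_bit_width=bit_width)
+        # pointwise (kernel size 1) convolution mixes information across channels
+        self.pointwise = qnn.QuantConv1d(channels, channels, kernel_size=1,
+                                         weight_bit_width=bit_width)
+        self.bn = nn.BatchNorm1d(channels)
+        # quantized activation, e.g. 8 or 4 bit
+        self.act = qnn.QuantReLU(bit_width=bit_width)
+
+    def forward(self, x):
+        return self.act(self.bn(self.pointwise(self.depthwise(x))))
+```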
+
+Starting from the [official implementation](https://github.com/NVIDIA/NeMo/blob/master/examples/asr/quartznet.py),
+the first step was to implement a quantized version of the topology in Brevitas, using quantized convolutions and activations.
+
+After implementing the quantized version, the second step was to re-train the model, starting
+from the [pre-trained models](https://ngc.nvidia.com/catalog/models/nvidia:quartznet_15x5_ls_sp)
+kindly released by the authors.
+
+We focused on three main quantization configurations: two configurations at 8 bit, with per-tensor and per-channel scaling respectively,
+and one configuration at 4 bit, with per-channel scaling.
+
+We compare our results with those achieved by the authors, not only in terms of pure WER, but also in terms of the parameters' memory footprint
+and the number of operations performed. Note that the WER is always based on greedy decoding. The results can be seen in Fig. 3 and Fig. 4,
+and are summarized in Table 1.
+
+| Configuration | Word Error Rate (WER) | Memory Footprint (MegaByte) | Mega MACs |
+| :-----------: | :-------------------: | :-------------------------: | :-------: |
+| FP 300E, 1G | 11.58% | 37.69 | 1658.54 |
+| FP 400E, 1G | 11.08% | 37.69 | 1658.54 |
+| FP 1500E, 1G | 10.78% | 37.69 | 1658.54 |
+| FP 300E, 2G | 12.52% | 24.06 | 1058.75 |
+| FP 300E, 4G | 13.48% | 17.25 | 758.86 |
+| 8 bit, 1G Per-Channel scaling | 10.98% | 18.58 | 414.63 |
+| 8 bit, 1G Per-Tensor scaling | 11.03% | 18.58 | 414.63 |
+| 4 bit, 1G Per-Channel scaling | 12.00% | 9.44 | 104.18 |
+
+| <img src="https://xilinx.github.io/finn/img/WERMB.jpg" alt="WERvsMB" title="WERvsMB" width="500" height="300" align="center"/> |
+| :---:|
+| *Fig. 3 Memory footprint over WER on LibriSpeech dev-other* |
+
+| <img src="https://xilinx.github.io/finn/img/WERNops.jpg" alt="WERvsMACs" title="WERvsMACs" width="500" height="300" align="center"/> |
+| :---: |
+| *Fig. 4 Number of MAC operations over WER on LibriSpeech dev-other* |
+
+In evaluating the memory footprint, we consider half-precision (16 bit) Floating Point (FP) numbers for the original QuartzNet.
+As we can see in Fig. 3, the quantized implementations achieve accuracy comparable to the corresponding floating-point version,
+while greatly reducing the memory occupation. In the graph, the term <em>E</em> stands for Epochs, while <em>G</em> stands for Groups, referring
+to the number of groups used for the grouped convolutions.
+In the case of our 4 bit implementation, the first and last layers are left at 8 bit, but this is taken into account in the computation
+of both the memory occupation and the number of operations.
+Notice how the 4 bit version greatly reduces the memory footprint of the network compared to the grouped-convolution variants, while still delivering better accuracy.
+
+
+For comparing accuracy against the number of multiply-accumulate (MAC) operations, we treat 16 bit floating-point multiplications as 16 bit integer multiplications.
+This means that we are greatly underestimating the complexity of the operations performed in the original floating-point QuartzNet model.
+Assuming an n^2 growth in the cost of integer multiplication, we consider a 4 bit MAC 16x less expensive than a 16 bit one.
+The number of MACs in Fig. 4 is normalized with respect to the 16 bit case.
+Also in this case, it is clear that the quantized versions greatly reduce the number of operations required,
+with little to no degradation in accuracy.
+In particular, the 8 bit versions already achieve a better WER and a lower number of MACs than the grouped-convolution variants,
+and this is confirmed by the 4 bit version as well, with only a small degradation in WER.
diff --git a/docs/img/JasperVertical4.jpg b/docs/img/JasperVertical4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d7364ec8a99f51e77b421c85a8da4eebe2883751
Binary files /dev/null and b/docs/img/JasperVertical4.jpg differ
diff --git a/docs/img/QuartzNet.jpg b/docs/img/QuartzNet.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ce258fcd5f458caae606af0973c2eb14aea0af27
Binary files /dev/null and b/docs/img/QuartzNet.jpg differ
diff --git a/docs/img/WERMB.jpg b/docs/img/WERMB.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3c1ce7d6bc3e378f6e75c204a01538f02a9cb007
Binary files /dev/null and b/docs/img/WERMB.jpg differ
diff --git a/docs/img/WERNops.jpg b/docs/img/WERNops.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e539bb26077fb98f9a0f7b554ed63a18d57207a1
Binary files /dev/null and b/docs/img/WERNops.jpg differ
diff --git a/docs/img/quartzPic1.jpg b/docs/img/quartzPic1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cec4829f2187d720be8589d075c83443eaaef69c
Binary files /dev/null and b/docs/img/quartzPic1.jpg differ
diff --git a/docs/img/rn50-ipi.png b/docs/img/rn50-ipi.png
new file mode 100644
index 0000000000000000000000000000000000000000..504b011c9660b446ae39d407a8ce3d824bd2cd6a
Binary files /dev/null and b/docs/img/rn50-ipi.png differ
diff --git a/run-docker.sh b/run-docker.sh
index f5c9f64b7d89e7def72c5b39131f37c22fcf57bf..e1ccb2a2a1c3270a97ee044013b8a267c905b5a3 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -65,44 +65,15 @@ SCRIPT=$(readlink -f "$0")
 # Absolute path this script is in, thus /home/user/bin
 SCRIPTPATH=$(dirname "$SCRIPT")
-BREVITAS_REPO=https://github.com/Xilinx/brevitas.git
-EXAMPLES_REPO=https://github.com/maltanar/brevitas_cnv_lfc.git
-CNPY_REPO=https://github.com/rogersce/cnpy.git
-#FINN_HLS_REPO=https://github.com/Xilinx/finn-hlslib.git
-FINN_HLS_REPO=https://github.com/Tobi-Alonso/finn-hlslib.git
-PYVERILATOR_REPO=https://github.com/maltanar/pyverilator
-PYNQSHELL_REPO=https://github.com/maltanar/PYNQ-HelloWorld.git
-
-BREVITAS_LOCAL=$SCRIPTPATH/brevitas
-EXAMPLES_LOCAL=$SCRIPTPATH/brevitas_cnv_lfc
-CNPY_LOCAL=$SCRIPTPATH/cnpy
-FINN_HLS_LOCAL=$SCRIPTPATH/finn-hlslib
-PYVERILATOR_LOCAL=$SCRIPTPATH/pyverilator
-PYNQSHELL_LOCAL=$SCRIPTPATH/PYNQ-HelloWorld
 BUILD_LOCAL=/tmp/$DOCKER_INST_NAME
 VIVADO_HLS_LOCAL=$VIVADO_PATH
 VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache
-# clone dependency repos
-git clone --branch feature/finn_onnx_export $BREVITAS_REPO $BREVITAS_LOCAL || git -C "$BREVITAS_LOCAL" pull
-git clone $EXAMPLES_REPO $EXAMPLES_LOCAL || git -C "$EXAMPLES_LOCAL" checkout feature/rework_scaling_clipping; git -C "$EXAMPLES_LOCAL" pull
-git clone $CNPY_REPO $CNPY_LOCAL || git -C "$CNPY_LOCAL" pull
-git clone $FINN_HLS_REPO $FINN_HLS_LOCAL || git -C "$FINN_HLS_LOCAL" checkout master; git -C "$FINN_HLS_LOCAL" pull
-git clone $PYVERILATOR_REPO $PYVERILATOR_LOCAL || git -C "$PYVERILATOR_LOCAL" pull
-git clone $PYNQSHELL_REPO $PYNQSHELL_LOCAL || git -C "$PYNQSHELL_LOCAL" checkout feature/synth_rpt; git -C "$PYNQSHELL_LOCAL" pull
-
 # ensure build dir exists locally
 mkdir -p $BUILD_LOCAL
 mkdir -p $VIVADO_IP_CACHE
 echo "Instance is named as $DOCKER_INST_NAME"
-echo "Mounting $SCRIPTPATH into /workspace/finn"
-echo "Mounting $SCRIPTPATH/brevitas into /workspace/brevitas"
-echo "Mounting $SCRIPTPATH/brevitas_cnv_lfc into /workspace/brevitas_cnv_lfc"
-echo "Mounting $SCRIPTPATH/cnpy into /workspace/cnpy"
-echo "Mounting $SCRIPTPATH/finn-hlslib into /workspace/finn-hlslib"
-echo "Mounting $SCRIPTPATH/pyverilator into /workspace/pyverilator"
-echo "Mounting $SCRIPTPATH/PYNQ-HelloWorld into /workspace/PYNQ-HelloWorld"
 echo "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
 echo "Mounting $VIVADO_PATH into $VIVADO_PATH"
 echo "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
@@ -137,12 +108,6 @@ docker run -t --rm --name $DOCKER_INST_NAME -it \
 -e "XILINX_VIVADO=$VIVADO_PATH" \
 -e "SHELL=/bin/bash" \
 -v $SCRIPTPATH:/workspace/finn \
--v $SCRIPTPATH/brevitas:/workspace/brevitas \
--v $SCRIPTPATH/brevitas_cnv_lfc:/workspace/brevitas_cnv_lfc \
--v $SCRIPTPATH/cnpy:/workspace/cnpy \
--v $SCRIPTPATH/finn-hlslib:/workspace/finn-hlslib \
--v $SCRIPTPATH/pyverilator:/workspace/pyverilator \
--v $SCRIPTPATH/PYNQ-HelloWorld:/workspace/PYNQ-HelloWorld \
 -v $BUILD_LOCAL:$BUILD_LOCAL \
 -v $VIVADO_PATH:$VIVADO_PATH \
 -e VIVADO_PATH=$VIVADO_PATH \