diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 4374111f22a12e586c5c5233a7eee096b848b86e..00c25a4a3150a8368405b449fdce04456ccbe88d 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -1,17 +1,18 @@ name: DockerImage on: + pull_request: + branches: [ dev ] push: - branches: - - 'dev' + branches: [ dev ] jobs: docker: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v1 diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml index ec92c84665d868b8a4376c82ecdf72395f1367a8..e2ba47ec296f73cfd7c0eede98bac3acd066075a 100644 --- a/.github/workflows/quicktest-dev-pr.yml +++ b/.github/workflows/quicktest-dev-pr.yml @@ -11,11 +11,11 @@ jobs: test: name: Run quicktest on PR branch - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: DockerRunQuicktest run: | diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index b3c669ec1097745bd30f650ca0b9dacda647c61d..dbafba247679895bcbaf385f0d33946c3f810945 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -84,7 +84,7 @@ RUN rm requirements.txt # extra Python package dependencies (for testing and interaction) RUN pip install pygments==2.4.1 RUN pip install ipykernel==5.5.5 -RUN pip install jupyter==1.0.0 +RUN pip install jupyter==1.0.0 --ignore-installed RUN pip install markupsafe==2.0.1 RUN pip install matplotlib==3.3.1 --ignore-installed RUN pip install pytest-dependency==0.5.1 diff --git a/docs/finn/source_code/finn.analysis.fpgadataflow.rst b/docs/finn/source_code/finn.analysis.fpgadataflow.rst index b52e994ee6033d4c3c1aae6400e20e103455d7b6..57472cb670b6fa6cb95e6c137458d3a522f82f5a 100644 --- a/docs/finn/source_code/finn.analysis.fpgadataflow.rst +++ 
b/docs/finn/source_code/finn.analysis.fpgadataflow.rst @@ -30,6 +30,7 @@ finn.analysis.fpgadataflow.floorplan\_params :undoc-members: :show-inheritance: + finn.analysis.fpgadataflow.hls\_synth\_res\_estimation ------------------------------------------------------------- @@ -38,14 +39,15 @@ finn.analysis.fpgadataflow.hls\_synth\_res\_estimation :undoc-members: :show-inheritance: - finn.analysis.fpgadataflow.op\_and\_param\_counts - -------------------------------------------------- +finn.analysis.fpgadataflow.op\_and\_param\_counts +-------------------------------------------------- - .. automodule:: finn.analysis.fpgadataflow.op_and_param_counts +.. automodule:: finn.analysis.fpgadataflow.op_and_param_counts :members: :undoc-members: :show-inheritance: + finn.analysis.fpgadataflow.post\_synth\_res -------------------------------------------------- @@ -54,6 +56,7 @@ finn.analysis.fpgadataflow.post\_synth\_res :undoc-members: :show-inheritance: + finn.analysis.fpgadataflow.res\_estimation ------------------------------------------------- diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst index 4e3de458e153871d1d5969442af5940dc1673ecd..afa1ecffa08213db6a282076c6fdf59694f9e13e 100644 --- a/docs/finn/source_code/finn.core.rst +++ b/docs/finn/source_code/finn.core.rst @@ -37,6 +37,15 @@ qonnx.core.modelwrapper :undoc-members: :show-inheritance: +qonnx.core.onnx\_exec +--------------------------- + +.. automodule:: qonnx.core.onnx_exec + :members: + :undoc-members: + :show-inheritance: + + finn.core.onnx\_exec --------------------------- diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index cc56ea603e589d7000fe5b2b2943e67cdb90c884..fdcf44c6d99561658b727dc64c0a1b98b247c7df 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -8,7 +8,7 @@ HLS Custom Op Nodes Base Class ---------- -.. 
automodule:: finn.custom_op.fpgadataflow +.. automodule:: finn.custom_op.fpgadataflow.hlscustomop :members: :undoc-members: :show-inheritance: @@ -29,9 +29,25 @@ finn.custom\_op.fpgadataflow.channelwise\_op\_batch :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.checksum +-------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.checksum + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.concat +------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.concat + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.convolutioninputgenerator -------------------------------------------------------------- +-------------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator :members: @@ -46,6 +62,15 @@ finn.custom\_op.fpgadataflow.convolutioninputgenerator1d :undoc-members: :show-inheritance: + +finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl +------------------------------------------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.downsampler ----------------------------------------- @@ -62,6 +87,16 @@ finn.custom\_op.fpgadataflow.duplicatestreams\_batch :undoc-members: :show-inheritance: + +finn.custom\_op.fpgadataflow.eltwise +------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.eltwise + :members: + :undoc-members: + :show-inheritance: + + finn.custom\_op.fpgadataflow.fmpadding\_batch ----------------------------------------------- @@ -79,7 +114,7 @@ finn.custom\_op.fpgadataflow.globalaccpool\_batch :show-inheritance: finn.custom\_op.fpgadataflow.iodma ------------------------------------------------ +------------------------------------ .. 
automodule:: finn.custom_op.fpgadataflow.iodma :members: @@ -102,6 +137,15 @@ finn.custom\_op.fpgadataflow.lookup :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.matrixvectoractivation +----------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation + :members: + :undoc-members: + :show-inheritance: + + finn.custom\_op.fpgadataflow.pool\_batch ----------------------------------------------- @@ -127,14 +171,6 @@ finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.matrixvectoractivation ------------------------------------------------------------ - -.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation - :members: - :undoc-members: - :show-inheritance: - finn.custom\_op.fpgadataflow.streamingfifo ------------------------------------------------- diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst index 20d90a7bb596d6ce5638d9b2d9bae8a5c7e5c723..cdbe957c713ef6916e4ed7baabe09135f71fdeef 100644 --- a/docs/finn/source_code/finn.custom_op.rst +++ b/docs/finn/source_code/finn.custom_op.rst @@ -9,6 +9,7 @@ Submodules :maxdepth: 2 finn.custom_op.fpgadataflow + qonnx.custom_op.channels_last qonnx.custom_op.general Custom Op Nodes diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst index b1e7075bdcfb675a894f3e66b61d59117e4f078d..9f8ec079309f16daa022e14317ebddfd7758d639 100644 --- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst +++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst @@ -62,6 +62,14 @@ finn.transformation.fpgadataflow.create\_stitched\_ip :undoc-members: :show-inheritance: +finn.transformation.fpgadataflow.derive\_characteristic +------------------------------------------------------------ + +.. 
automodule:: finn.transformation.fpgadataflow.derive_characteristic + :members: + :undoc-members: + :show-inheritance: + finn.transformation.fpgadataflow.externalize\_params ------------------------------------------------------------ @@ -103,6 +111,17 @@ finn.transformation.fpgadataflow.insert\_fifo :undoc-members: :show-inheritance: + +finn.transformation.fpgadataflow.insert\_hook +---------------------------------------------------- + +.. automodule:: finn.transformation.fpgadataflow.insert_hook + :members: + :undoc-members: + :show-inheritance: + + + finn.transformation.fpgadataflow.insert\_iodma ---------------------------------------------------- diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst index 6a28eeedb2aa547ba80677864ae9fb8c6aa64097..f42b595a50ec90ef055e2818d66f4b2410c25594 100644 --- a/docs/finn/source_code/finn.transformation.rst +++ b/docs/finn/source_code/finn.transformation.rst @@ -20,7 +20,7 @@ Transformation Passes Base Class ---------- -.. automodule:: finn.transformation +.. automodule:: qonnx.transformation.base :members: :undoc-members: :show-inheritance: @@ -42,7 +42,7 @@ qonnx.transformation.bipolar\_to\_xnor :show-inheritance: qonnx.transformation.change\_3d\_tensors\_to\_4d ------------------------------------------------- +------------------------------------------------- .. automodule:: qonnx.transformation.change_3d_tensors_to_4d :members: @@ -57,8 +57,18 @@ qonnx.transformation.change\_datalayout :undoc-members: :show-inheritance: + +qonnx.transformation.channels\_last +-------------------------------------------- + +.. automodule:: qonnx.transformation.channels_last + :members: + :undoc-members: + :show-inheritance: + + qonnx.transformation.create\_generic\_partitions ------------------------------------------------- +------------------------------------------------- .. 
automodule:: qonnx.transformation.create_generic_partitions :members: @@ -171,13 +181,22 @@ qonnx.transformation.merge\_onnx\_models :show-inheritance: -finn.transformation.move\_reshape +qonnx.transformation.quant\_constant\_folding +---------------------------------------------- + +.. automodule:: qonnx.transformation.quant_constant_folding + :members: + :undoc-members: + :show-inheritance: + + +qonnx.transformation.rebalance\_conv ---------------------------------------- -.. automodule:: finn.transformation.move_reshape - :members: - :undoc-members: - :show-inheritance: +.. automodule:: qonnx.transformation.rebalance_conv + :members: + :undoc-members: + :show-inheritance: qonnx.transformation.remove ------------------------------------- @@ -186,3 +205,12 @@ qonnx.transformation.remove :members: :undoc-members: :show-inheritance: + + +finn.transformation.move\_reshape +---------------------------------------- + +.. automodule:: finn.transformation.move_reshape + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index 8dffa016327c3bbe50f21278c859c83556b2b213..7ba3b252abfa0086a8c0281eb9a792fb239d6ec3 100644 --- a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -14,6 +14,15 @@ qonnx.util.basic :show-inheritance: +qonnx.util.cleanup +---------------------- + +.. automodule:: qonnx.util.cleanup + :members: + :undoc-members: + :show-inheritance: + + qonnx.util.config -------------------- @@ -22,6 +31,40 @@ qonnx.util.config :undoc-members: :show-inheritance: +qonnx.util.exec\_qonnx +---------------------- + +.. automodule:: qonnx.util.exec_qonnx + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.inference\_cost +-------------------------- + +.. automodule:: qonnx.util.inference_cost + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.onnx +------------------- + +.. 
automodule:: qonnx.util.onnx + :members: + :undoc-members: + :show-inheritance: + + +qonnx.util.to\_channels\_last +------------------------------ + +.. automodule:: qonnx.util.to_channels_last + :members: + :undoc-members: + :show-inheritance: + + finn.util.basic ---------------------- @@ -64,6 +107,15 @@ finn.util.gdrive :undoc-members: :show-inheritance: +finn.util.hls +--------------- + +.. automodule:: finn.util.hls + :members: + :undoc-members: + :show-inheritance: + + finn.util.imagenet ----------------------------- @@ -72,14 +124,6 @@ finn.util.imagenet :undoc-members: :show-inheritance: -qonnx.util.onnx ---------------------- - -.. automodule:: qonnx.util.onnx - :members: - :undoc-members: - :show-inheritance: - finn.util.platforms -------------------- diff --git a/docs/finn/source_code/modules.rst b/docs/finn/source_code/modules.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/finn/source_code/qonnx.custom_op.channels_last.rst b/docs/finn/source_code/qonnx.custom_op.channels_last.rst new file mode 100644 index 0000000000000000000000000000000000000000..3ad10d94a6b34a99e2213994a75b0f063fd3d36f --- /dev/null +++ b/docs/finn/source_code/qonnx.custom_op.channels_last.rst @@ -0,0 +1,41 @@ +************************** +Custom Op - Channels Last +************************** + +Channels Last Custom Ops +========================= + +qonnx.custom\_op.channels\_last.base\_wrapped\_op +-------------------------------------------------- + +.. automodule:: qonnx.custom_op.channels_last.base_wrapped_op + :members: + :undoc-members: + :show-inheritance: + + +qonnx.custom\_op.channels\_last.batch\_normalization +------------------------------------------------------ + +.. automodule:: qonnx.custom_op.channels_last.batch_normalization + :members: + :undoc-members: + :show-inheritance: + + +qonnx.custom\_op.channels\_last.conv +-------------------------------------- + +.. 
automodule:: qonnx.custom_op.channels_last.conv + :members: + :undoc-members: + :show-inheritance: + + +qonnx.custom\_op.channels\_last.max\_pool +------------------------------------------ + +.. automodule:: qonnx.custom_op.channels_last.max_pool + :members: + :undoc-members: + :show-inheritance: diff --git a/fetch-repos.sh b/fetch-repos.sh index b0f6400ed142b203b1c9f6d7ea4ac6ababcf34d1..5e668e04499fcf825382dc2785a92dc01c0e7d88 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="f702b17cdb9d5e57f85f43a5d33890647e063de6" +QONNX_COMMIT="7d50273a4dcccb445fb06f57f6bedc17b3707b35" FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" diff --git a/requirements.txt b/requirements.txt index 9038a5e8170301421529e0b570482316e4fff20a..348b1afab9deca1547d40cb8d8c54a396befa65d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,6 @@ bitstring==3.1.7 clize==4.1.1 dataclasses-json==0.5.7 docrep==0.2.7 -future==0.18.2 gspread==3.6.0 numpy==1.22.0 onnx==1.11.0 @@ -10,6 +9,7 @@ onnxoptimizer onnxruntime==1.11.1 pre-commit==2.9.2 protobuf==3.20.2 +psutil==5.9.4 pyscaffold==3.2.1 scipy==1.5.2 setupext-janitor>=1.1.2 diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index d3c4156d9b4ccf601d3eea348f6cb61c0d9a6e87..80934d812fd7a165fdaba9c3fb17dc37e5a82d49 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -259,6 +259,10 @@ class DataflowBuildConfig: AutoFIFOSizingMethod ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM + #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test + #: if set to True, always using Python instead + force_python_rtlsim: Optional[bool] 
= False + #: Memory resource type for large FIFOs #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 5da608c27def8136f9ad11f62b4707452eac3120..956b4fd3be863d74e5c33de2faf69b56ebef9406 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -30,6 +30,7 @@ import json import numpy as np import os import shutil +import warnings from copy import deepcopy from distutils.dir_util import copy_tree from qonnx.core.modelwrapper import ModelWrapper @@ -113,6 +114,7 @@ from finn.util.basic import ( get_rtlsim_trace_depth, pyverilate_get_liveness_threshold_cycles, ) +from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent @@ -531,11 +533,20 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) elif cfg.auto_fifo_strategy == "largefifo_rtlsim": + # multi-in/out streams currently not supported in our C++ verilator driver + model_multi_io = len(model.graph.input) > 1 or len(model.graph.output) > 1 + force_python_sim = model_multi_io or cfg.force_python_rtlsim + if model_multi_io: + warnings.warn( + "Multi-in/out streams currently not supported " + + "in FINN C++ verilator driver, falling back to Python" + ) model = model.transform( InsertAndSetFIFODepths( cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period(), vivado_ram_style=cfg.large_fifo_mem_style, + force_python_sim=force_python_sim, ) ) else: @@ -632,20 +643,48 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi # prepare ip-stitched rtlsim rtlsim_model = deepcopy(model) rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg) - # run with single input to get latency - orig_rtlsim_trace_depth = 
get_rtlsim_trace_depth() + # multi-in/out streams currently not supported in our C++ verilator driver + model_multi_io = ( + len(rtlsim_model.graph.input) > 1 or len(rtlsim_model.graph.output) > 1 + ) + force_python_rtlsim = cfg.force_python_rtlsim or model_multi_io + if model_multi_io: + warnings.warn( + "Multi-in/out streams currently not supported " + + "in FINN C++ verilator driver, falling back to Python" + ) rtlsim_bs = int(cfg.rtlsim_batch_size) - assert rtlsim_bs > 0, "rtlsim batch size must be >0" - if cfg.verify_save_rtlsim_waveforms: - # set depth to 3 for layer-by-layer visibility - os.environ["RTLSIM_TRACE_DEPTH"] = "3" + if force_python_rtlsim: + # run with single input to get latency + orig_rtlsim_trace_depth = get_rtlsim_trace_depth() + assert rtlsim_bs > 0, "rtlsim batch size must be >0" + if cfg.verify_save_rtlsim_waveforms: + # set depth to 3 for layer-by-layer visibility + os.environ["RTLSIM_TRACE_DEPTH"] = "3" + rtlsim_model.set_metadata_prop( + "rtlsim_trace", + "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs), + ) rtlsim_model.set_metadata_prop( - "rtlsim_trace", "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs) + "extra_verilator_args", str(["-CFLAGS", "-O3"]) ) - rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"])) - rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs) - rtlsim_latency = rtlsim_perf_dict["cycles"] - rtlsim_perf_dict["latency_cycles"] = rtlsim_latency + rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs) + rtlsim_latency = rtlsim_perf_dict["cycles"] + rtlsim_perf_dict["latency_cycles"] = rtlsim_latency + else: + rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs) + # keep keys consistent between the Python and C++-styles + cycles = rtlsim_perf_dict["cycles"] + clk_ns = float(model.get_metadata_prop("clk_ns")) + fclk_mhz = 1 / (clk_ns * 0.001) + runtime_s = (cycles * clk_ns) * (10**-9) + rtlsim_perf_dict["runtime[ms]"] = runtime_s * 1000 + 
rtlsim_perf_dict["throughput[images/s]"] = rtlsim_bs / runtime_s + rtlsim_perf_dict["fclk[mhz]"] = fclk_mhz + for key in list(rtlsim_perf_dict.keys()): # iterate over a copy of the keys: the loop deletes entries + if "max_count" in key: + del rtlsim_perf_dict[key] + with open(report_dir + "/rtlsim_performance.json", "w") as f: json.dump(rtlsim_perf_dict, f, indent=2) if cfg.verify_save_rtlsim_waveforms: diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index f307be95c30d822dfc517e4c331bd8d82d727997..d1326607aa0dc5c34eef105b2ceb8ed86c1a0458 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -43,6 +43,7 @@ from finn.util.basic import ( pyverilate_get_liveness_threshold_cycles, ) from finn.util.hls import CallHLS +from finn.util.pyverilator import make_single_source_file from . import templates @@ -174,7 +175,7 @@ class HLSCustomOp(CustomOp): # default impl only returns the HLS verilog codegen dir return [verilog_path] - def get_all_verilog_filenames(self): + def get_all_verilog_filenames(self, abspath=False): "Return list of all Verilog files used for this node." 
verilog_files = [] @@ -182,7 +183,10 @@ class HLSCustomOp(CustomOp): for verilog_path in verilog_paths: for f in os.listdir(verilog_path): if f.endswith(".v"): - verilog_files += [f] + if abspath: + verilog_files += [verilog_path + "/" + f] + else: + verilog_files += [f] return verilog_files def prepare_rtlsim(self): @@ -192,13 +196,18 @@ class HLSCustomOp(CustomOp): if PyVerilator is None: raise ImportError("Installation of PyVerilator is required.") - verilog_paths = self.get_all_verilog_paths() - verilog_files = self.get_all_verilog_filenames() + + verilog_files = self.get_all_verilog_filenames(abspath=True) + single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") + tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" + make_single_source_file(verilog_files, target_file) + # build the Verilator emu library sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, + self.get_verilog_top_module_name() + ".v", + build_dir=tmp_build_dir, + verilog_path=[single_src_dir], trace_depth=get_rtlsim_trace_depth(), top_module_name=self.get_verilog_top_module_name(), ) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index df9d1f1e70674f7bc91460e154f4e24af08df79c..72128fda4cfe23db4858fe3ffe80a755733954cc 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -576,6 +576,10 @@ class MatrixVectorActivation(HLSCustomOp): def minimize_accumulator_width(self, model): weights = model.get_initializer(self.onnx_node.input[1]) + # since in the calculation the values of the weight matrix are used, + # for the bipolar case they need to be converted to bipolar + if self.get_nodeattr("binaryXnorMode"): + weights = 2 * 
weights - 1 if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) else: @@ -702,10 +706,12 @@ class MatrixVectorActivation(HLSCustomOp): of weights. Arguments: + * weights : numpy array with weights to be put into the file * weight_file_mode : one of {hls_header, decoupled_verilog_dat, decoupled_runtime} * weight_file_name : filename for the weight file to be generated + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index 91cd537baeff0c7666bbf3596b46a7412ec2fe4e..813f13e504eae181f4398eccbe40ad66b6e3bf16 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -42,12 +42,13 @@ class Pool_Batch(HLSCustomOp): Output shape (BatchSize,OutImgDim,OutImgDim,Channels) Notes: - # The input shape was chosen to be compatible with im2col (only true when there - is not folding). - # The actual data layout produced by the hlslib kernels is different - for depthwise ops. - * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + * The input shape was chosen to be compatible with im2col (only true when there + is not folding). + * The actual data layout produced by the hlslib kernels is different + for depthwise ops. + + * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) Channels can be folded using PE (SIMD from the input perspective) """ diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index ec5742376798fa7a31ab1e3e6a13ed1a4a0607ac..03cfbe0e09be092cf9446f29110b01ef0cc1e367 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -354,10 +354,12 @@ class Thresholding_Batch(HLSCustomOp): run-time reconfig of weights. 
Arguments: + * weights : numpy array with weights to be put into the file * weight_file_mode : one of {hls_header, decoupled_verilog_dat, decoupled_runtime} * weight_file_name : filename for the weight file to be generated + """ threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) tdt = self.get_weight_datatype() diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index a411d245a9a397e6d827abfa8ee4a784f207ecd5..d5e29ca22acf89440c3c3a66101bec89d4a66d46 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -410,10 +410,12 @@ class VectorVectorActivation(HLSCustomOp): of weights. Arguments: + * weights : numpy array with weights to be put into the file * weight_file_mode : one of {hls_header, decoupled_verilog_dat, decoupled_runtime} * weight_file_name : filename for the weight file to be generated + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) diff --git a/src/finn/qnn-data/cpp/verilator_fifosim.cpp b/src/finn/qnn-data/cpp/verilator_fifosim.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0aca9efe77806d31192f35a1d751b32116218f8 --- /dev/null +++ b/src/finn/qnn-data/cpp/verilator_fifosim.cpp @@ -0,0 +1,197 @@ +/* Copyright (C) 2022, Advanced Micro Devices, Inc. +All rights reserved. +# +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +# +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +# +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+# +* Neither the name of FINN nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. +# +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +#include <iostream> +#include <fstream> +#include <cstddef> +#include <chrono> +#include "verilated.h" +#include "verilated_vcd_c.h" +#include "Vfinn_design_wrapper.h" + +#ifdef DEBUG +#define TRACE(x) x +#else +#define TRACE(x) ; +#endif + +using namespace std; + +Vfinn_design_wrapper * top; + +// code taken from pyverilator_wrapper.cpp generated by PyVerilator + +// this is required by verilator for verilog designs using $time +// main_time is incremented in eval +double main_time = 0; + +double sc_time_stamp() { +return main_time; +} +// function definitions +// helper functions for basic verilator tasks +extern "C" { //Open an extern C closed below +Vfinn_design_wrapper* construct() { + Verilated::commandArgs(0, (const char**) nullptr); + TRACE(Verilated::traceEverOn(true)); + Vfinn_design_wrapper* top = new Vfinn_design_wrapper(); + return top; +} +int eval(Vfinn_design_wrapper* top) { + top->eval(); + main_time++; + return 0; +} +int destruct(Vfinn_design_wrapper* top) { + if (top != nullptr) { + delete top; + top = 
nullptr; + } + return 0; +} + +TRACE( +VerilatedVcdC* tfp; +VerilatedVcdC* start_vcd_trace(Vfinn_design_wrapper* top, const char* filename) { + VerilatedVcdC* tfp = new VerilatedVcdC; + top->trace(tfp, 99); + tfp->open(filename); + return tfp; +} +int add_to_vcd_trace(VerilatedVcdC* tfp, int time) { + tfp->dump(time); + return 0; +} +int flush_vcd_trace(VerilatedVcdC* tfp) { + tfp->flush(); + return 0; +} +int stop_vcd_trace(VerilatedVcdC* tfp) { + tfp->close(); + return 0; +} +) + +} + +// end of code taken from pyverilator_wrapper.cpp generated by PyVerilator + +inline void toggle_clk() { + eval(top); + top->ap_clk = 1; + TRACE(add_to_vcd_trace(tfp, main_time)); + eval(top); + top->ap_clk = 0; + TRACE(add_to_vcd_trace(tfp, main_time)); +} + + +void reset() { + top->ap_rst_n = 0; + for(unsigned i = 0; i < 10; i++) { + toggle_clk(); + } + top->ap_rst_n = 1; +} + +int main(int argc, char *argv[]) { + top = construct(); + TRACE(tfp = start_vcd_trace(top, "trace.vcd")); + unsigned n_iters_per_input = @ITERS_PER_INPUT@; + unsigned n_iters_per_output = @ITERS_PER_OUTPUT@; + unsigned n_inputs = @N_INPUTS@; + unsigned max_iters = @MAX_ITERS@; + + reset(); + + top->m_axis_0_tready = 1; + top->s_axis_0_tvalid = 1; + + unsigned n_in_txns = 0, n_out_txns = 0, iters = 0, last_output_at = 0; + unsigned latency = 0; + + bool exit_criterion = false; + + cout << "Simulation starting" << endl; + cout << "Number of inputs to write " << n_iters_per_input * n_inputs << endl; + cout << "Number of outputs to expect " << n_iters_per_output * n_inputs << endl; + cout << "No-output timeout clock cycles " << max_iters << endl; + + chrono::steady_clock::time_point begin = chrono::steady_clock::now(); + + while(!exit_criterion) { + toggle_clk(); + iters++; + if(iters % 1000 == 0) { + cout << "Elapsed iters " << iters << " inps " << n_in_txns << " outs " << n_out_txns << endl; + chrono::steady_clock::time_point end = chrono::steady_clock::now(); + cout << "Elapsed since last report = " << 
chrono::duration_cast<chrono::seconds>(end - begin).count() << "[s]" << endl; + begin = end; + } + if(top->s_axis_0_tready == 1 && top->s_axis_0_tvalid == 1) { + n_in_txns++; + if(n_in_txns == n_iters_per_input * n_inputs) { + top->s_axis_0_tvalid = 0; + cout << "All inputs written at cycle " << iters << endl; + } + } + if(top->m_axis_0_tvalid == 1) { + n_out_txns++; + last_output_at = iters; + if(n_out_txns == n_iters_per_output) { + latency = iters; + } + } + + exit_criterion = ((n_in_txns >= n_iters_per_input * n_inputs) && (n_out_txns >= n_iters_per_output * n_inputs)) || ((iters-last_output_at) > max_iters); + } + + TRACE(flush_vcd_trace(tfp)); + TRACE(stop_vcd_trace(tfp)); + + cout << "Simulation finished" << endl; + cout << "Number of inputs consumed " << n_in_txns << endl; + cout << "Number of outputs produced " << n_out_txns << endl; + cout << "Number of clock cycles " << iters << endl; + + ofstream results_file; + results_file.open("results.txt", ios::out | ios::trunc); + results_file << "N_IN_TXNS" << "\t" << n_in_txns << endl; + results_file << "N_OUT_TXNS" << "\t" << n_out_txns << endl; + results_file << "cycles" << "\t" << iters << endl; + results_file << "N" << "\t" << n_inputs << endl; + results_file << "latency_cycles" << "\t" << latency << endl; +@FIFO_DEPTH_LOGGING@ + results_file.close(); + + + + destruct(top); + + return 0; +} diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 38c2d60c9a09644ceec07e11bf91315ee743e02c..0546643d1220603d40651c45a0c4032dcf5cfaaf 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -71,13 +71,15 @@ class InsertFIFO(Transformation): of the subsequent node. max() of these two values sets the FIFO depth. 
Constructor arguments: - - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of - Verilog FIFOs (Q_srl.v) - - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for - large FIFOs implemented by Vivado - - create_shallow_fifos : Normally, shallow-depth (<=2) FIFOs won't be created since - HLS streaming interfaces already have a degree of buffering. - Override with this parameter. + + :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP + instead of Verilog FIFOs (Q_srl.v) + :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute + to be used for large FIFOs implemented by Vivado + :parameter create_shallow_fifos: Normally, shallow-depth (<=2) FIFOs + won't be created since HLS streaming interfaces + already have a degree of buffering. + Override with this parameter. The other node attributes necessary to create a FIFO node are taken from the diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 5b3ead6d67d1f921ca070c01e976cbff43d7962a..80f5d9a09439a4ba4a7409a972987dc09acd8031 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -42,7 +42,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.util.fpgadataflow import is_fpgadataflow_node -from finn.util.pyverilator import pyverilate_stitched_ip +from finn.util.pyverilator import pyverilate_stitched_ip, verilator_fifosim def reset_implementation(node): @@ -72,8 +72,9 @@ def optimize_depth(depth): # Q_srl FIFOs do not benefit from size < 32 # add some slack return 32 - # round to nearest power of two for Vivado IP FIFO implementation - return int(2 ** math.ceil(math.log2(depth))) + # otherwise leave as is + # will be rounded to nearest power of 
two for Vivado-style FIFO + return int(depth) class RemoveShallowFIFOs(Transformation): @@ -125,14 +126,17 @@ class CapConvolutionFIFODepths(Transformation): constructor flag is set. Constructor arguments: - - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of - Verilog FIFOs (Q_srl.v) + + :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP + instead of Verilog FIFOs (Q_srl.v) Assumed input graph properties: + - all nodes are fpgadataflow nodes - FIFOs inserted with InsertAndSetFIFODepths Output: + - graph with smaller-depth FIFOs for convolutions Background: @@ -188,22 +192,25 @@ class InsertAndSetFIFODepths(Transformation): throughput in the created accelerator. Constructor arguments: - - clk_ns : clock period (used for IP preparation) - - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of - Verilog FIFOs (Q_srl.v) - - max_depth : how deep the "max"-sized FIFOs initially inserted will be - if set to None, use the tensor size as the depth - - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs - smaller where appropriate - - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for - large FIFOs implemented by Vivado afterwards + + :parameter clk_ns: clock period (used for IP preparation) + :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP + instead of Verilog FIFOs (Q_srl.v) + :parameter max_depth: how deep the "max"-sized FIFOs initially inserted + will be. 
If set to None, use the tensor size as the depth + :parameter swg_exception: call CapConvolutionFIFODepths to make convolution FIFOs + smaller where appropriate + :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute to be used + for large FIFOs implemented by Vivado afterwards Assumed input graph properties: + - all nodes are fpgadataflow nodes - no FIFOs inserted, - (inFIFODepths/outFIFODepths attrs will be ignored) Output: + - graph with appropriate-depth FIFOs inserted Background: @@ -211,12 +218,14 @@ class InsertAndSetFIFODepths(Transformation): necessary to insert FIFOs between them to prevent stalls due to bursty behavior. The sizes of those FIFOs are hard to predict analytically, so we do the following: + - insert deep (=tensor size) FIFOs between all fpgadataflow nodes - create stitched design - run through rtlsim with stream of multiple random input images (to fill pipeline) - keep track of observed maximum occupancy for each FIFO during rtlsim - when sim finished, update each FIFO depth to maximum observed occupancy and set inFIFODepths/outFIFODepths attrs to 0 on relevant nodes + """ def __init__( @@ -227,6 +236,7 @@ class InsertAndSetFIFODepths(Transformation): max_depth=None, swg_exception=True, vivado_ram_style="auto", + force_python_sim=False, ): super().__init__() self.fpgapart = fpgapart @@ -235,6 +245,7 @@ class InsertAndSetFIFODepths(Transformation): self.max_depth = max_depth self.swg_exception = swg_exception self.vivado_ram_style = vivado_ram_style + self.force_python_sim = force_python_sim def apply(self, model): # these optypes may potentially use external weights @@ -306,57 +317,75 @@ class InsertAndSetFIFODepths(Transformation): model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") - # calculate input frequency (number of cycles for each input word) - first_node = getCustomOp(model.graph.node[0]) - ncycles_per_input = max( - 1, - int( - math.ceil( - 
perf["max_cycles"] - / ( - np.prod(first_node.get_folded_input_shape()) - / first_node.get_folded_input_shape()[-1] + if self.force_python_sim: + # do rtlsim in Python for FIFO sizing + # calculate input frequency (number of cycles for each input word) + first_node = getCustomOp(model.graph.node[0]) + ncycles_per_input = max( + 1, + int( + math.ceil( + perf["max_cycles"] + / ( + np.prod(first_node.get_folded_input_shape()) + / first_node.get_folded_input_shape()[-1] + ) ) - ) - ), - ) + ), + ) - # set sufficiently large threshold for 1 image to fully execute and exit - ncycles = int(latency + max_cycles) + # set sufficiently large threshold for 1 image to fully execute and exit + ncycles = int(latency + max_cycles) - # prepare pyverilator model - sim = pyverilate_stitched_ip(model) + # prepare pyverilator model + sim = pyverilate_stitched_ip(model) - reset_rtlsim(sim) - toggle_clk(sim) + reset_rtlsim(sim) + toggle_clk(sim) - # set all input valids to 0 and output readies to 1 - # set input data to some constant - set_signal(sim, "tvalid", 0) - set_signal(sim, "tready", 1) - set_signal(sim, "tdata", 0) + # set all input valids to 0 and output readies to 1 + # set input data to some constant + set_signal(sim, "tvalid", 0) + set_signal(sim, "tready", 1) + set_signal(sim, "tdata", 0) + + output_detected = False + while ncycles > 0: + toggle_clk(sim) + # set/unset valids + if ncycles % ncycles_per_input == 0: + set_signal(sim, "tvalid", 1) + else: + set_signal(sim, "tvalid", 0) - output_detected = False - while ncycles > 0: - toggle_clk(sim) - # set/unset valids - if ncycles % ncycles_per_input == 0: - set_signal(sim, "tvalid", 1) - else: - set_signal(sim, "tvalid", 0) + # since latency estimation is very pessimistic, detect first output + # and fast-forward the sim + if get_signal(sim, "tvalid") != 0 and not output_detected: + ncycles = max_cycles + output_detected = True + else: + ncycles = ncycles - 1 - # since latency estimation is very pessimistic, detect first 
output - # and fast-forward the sim - if get_signal(sim, "tvalid") != 0 and not output_detected: - ncycles = max_cycles - output_detected = True + if not output_detected: + warnings.warn( + "No output detected, calculated FIFO depths may not be correct" + ) + else: + # do rtlsim in C++ for FIFO sizing + # determine # inputs for FIFO sizing according to topology type + swg_nodes = [ + x for x in model.graph.node if "ConvolutionInputGenerator" in x.op_type + ] + if len(swg_nodes) == 0: + # MLP, no layer overlap + # assuming half the nodes are now FIFOs, use half the # of + # nodes as # inputs to drive the imulation + n_inputs = int(len(model.graph.node) / 2) else: - ncycles = ncycles - 1 - - if not output_detected: - warnings.warn( - "No output detected, calculated FIFO depths may not be correct" - ) + # convnet, single input is typically enough to fill entire + # layer pipeline due to overlaps + n_inputs = 1 + sim = verilator_fifosim(model, n_inputs) for ind, node in enumerate(fifo_nodes): maxcount_name = "maxcount_%d" % ind diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index e24e24f1f8ebb2873c81617884cd333311d8aea9..2301fccdd4fff6310340ffe1dd8de7732a4f9bd4 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -62,17 +62,20 @@ class SetFolding(Transformation): Notable exceptions and special behavior: - * When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation), + When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation), which have two attributes (PE and SIMD): - * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max - (configurable in the SetFolding initializer, defaults to 36) - * then increases PE until the target is met or max PE reached - * When folding depthwise convolutions ("VVAU"/VectorVectorActivation) + * first increases SIMD while weight 
stream width per PE is <= mvau_wwidth_max + (configurable in the SetFolding initializer, defaults to 36) + * then increases PE until the target is met or max PE reached + + When folding depthwise convolutions ("VVAU"/VectorVectorActivation) or spatial reduction ops (Pool_Batch): - * the producer of the node is expected to be a ConvolutionInputGenerator - with depthwise=1, whose SIMD value will be set equal to the PE value of - its consumer node + + * the producer of the node is expected to be a ConvolutionInputGenerator + with depthwise=1, whose SIMD value will be set equal to the PE value of + its consumer node + """ def __init__( diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index 97da4d41524e7c86fb5a73375d9ea2c2c9aa10dc..e0a5666000fc2aa9599bb7475c1b8dd37489afac 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -358,16 +358,16 @@ class VitisBuild(Transformation): """Best-effort attempt at building the accelerator with Vitis. It assumes the model has only fpgadataflow nodes - fpga_part: string identifying the target FPGA - period_ns: target clock period - platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"] - strategy: Vitis optimization strategy - enable_debug: add Chipscope to all AXI interfaces - floorplan_file: path to a JSON containing a dictionary with SLR assignments - for each node in the ONNX graph. Must be parse-able by - the ApplyConfig transform. - enable_link: enable linking kernels (.xo files), otherwise just synthesize - them independently. 
+ :parameter fpga_part: string identifying the target FPGA + :parameter period_ns: target clock period + :parameter platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"] + :parameter strategy: Vitis optimization strategy + :parameter enable_debug: add Chipscope to all AXI interfaces + :parameter floorplan_file: path to a JSON containing a dictionary with + SLR assignments for each node in the ONNX graph. + Must be parse-able by the ApplyConfig transform. + :parameter enable_link: enable linking kernels (.xo files), + otherwise just synthesize them independently. """ def __init__( diff --git a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py index 967a1276365e4af1a6d617c081b9c04b4710da97..34f11d1e95e6bc3f6a36ce6d878ed493108b3ba6 100644 --- a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py +++ b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py @@ -56,12 +56,12 @@ class ConvertQONNXtoFINN(Transformation): is not converted to a MultiThreshold node. :param filter_function: Each candidate Quant and BinaryQant node is first evaluated - by this function. If the function returns False, - then the node is not converted to a MultiTrheshold node. - The function is given the model and candidate node as parameters. - Per default a filter function is inserted, which disables the conversion of - Quant nodes, which have a bit width of larger than 8. - Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) + by this function. If the function returns False, + then the node is not converted to a MultiTrheshold node. + The function is given the model and candidate node as parameters. + Per default a filter function is inserted, which disables the conversion of + Quant nodes, which have a bit width of larger than 8. 
+ Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) """ def __init__( diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index a50a5850779cadf7ab21b9c1c4dfdbb36232af42..9819086d826a51d1df5240d88c4fda8513cc9ba6 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -52,9 +52,7 @@ class QuantActBaseHandler(ABC): self._q_node = quant_node self._q_index = quant_node_index - @property @classmethod - @abstractmethod def valid_predecessor_op_types(self): """Defines which op types the preceding node is allowed to have for this type of activation. @@ -284,9 +282,11 @@ class QuantReluHandler(QuantActBaseHandler): """Class for converting a quantized relu operation expressed in the QONNX dialect to the FINN ONNX dialect.""" - valid_predecessor_op_types = [ - "Relu", - ] + @classmethod + def valid_predecessor_op_types(self): + return [ + "Relu", + ] def _check_compatibility(self): if self._q_node.op_type == "Quant": @@ -391,15 +391,17 @@ class QuantIdentityHandler(QuantActBaseHandler): these are equivalent to quantized identity activations. 
""" - valid_predecessor_op_types = [ - "BatchNormalization", - "Sub", - "Add", - "Mul", - "Div", - "DebugMarker", - None, - ] + @classmethod + def valid_predecessor_op_types(self): + return [ + "BatchNormalization", + "Sub", + "Add", + "Mul", + "Div", + "DebugMarker", + None, + ] def _check_compatibility(self): # Gather parameters to check diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py index 77025ecdf57d5a422992d4163d05c740454986bb..48dda3820deb051bd8a291188f02fe7d1dd2cc0b 100644 --- a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py +++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py @@ -30,7 +30,10 @@ import warnings from qonnx.transformation.base import Transformation -from finn.transformation.qonnx.qonnx_activation_handlers import QuantActBaseHandler +from finn.transformation.qonnx.qonnx_activation_handlers import ( + QuantActBaseHandler, + QuantIdentityHandler, +) def default_filter_function_generator(max_multithreshold_bit_width=8): @@ -66,8 +69,7 @@ def default_filter_function_generator(max_multithreshold_bit_width=8): class ConvertQuantActToMultiThreshold(Transformation): - """ - Converts Quant nodes in the activation path to MultiThreshold nodes. + """Converts Quant nodes in the activation path to MultiThreshold nodes. The optional keyword argument `filter_function` presents a way to control which Quant and BipolarQuant nodes in the activation path @@ -75,12 +77,12 @@ class ConvertQuantActToMultiThreshold(Transformation): is not converted to a MultiThreshold node. :param filter_function: Each candidate Quant and BinaryQant node is first evaluated - by this function. If the function returns False, - then the node is not converted to a MultiTrheshold node. - The function is given the model and candidate node as parameters. 
- Per default a filter function is inserted, which disables the conversion of - Quant nodes, which have a bit width of larger than 8. - Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) + by this function. If the function returns False, + then the node is not converted to a MultiTrheshold node. + The function is given the model and candidate node as parameters. + Per default a filter function is inserted, which disables the conversion of + Quant nodes, which have a bit width of larger than 8. + Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) """ def __init__( @@ -127,7 +129,7 @@ class ConvertQuantActToMultiThreshold(Transformation): # Check for possible ambiguity in handler selection valid_predecessors = [] for cls in QuantActBaseHandler.__subclasses__(): - valid_predecessors.extend(cls.valid_predecessor_op_types) + valid_predecessors.extend(cls.valid_predecessor_op_types()) if len(valid_predecessors) != len(set(valid_predecessors)): raise RuntimeError( "Two or more activation handlers declare the same " @@ -138,16 +140,15 @@ class ConvertQuantActToMultiThreshold(Transformation): # Try to find a fitting handler for this Quant activation node for handler_cls in QuantActBaseHandler.__subclasses__(): - if predecessor_op_type in handler_cls.valid_predecessor_op_types: + if predecessor_op_type in handler_cls.valid_predecessor_op_types(): handler = handler_cls(model, n, node_ind) break else: - raise ValueError( - f"Quant nodes in the activation path and with predecessor " - f"nodes of type {predecessor_op_type} are currently not " - f"supported by FINN and can not be converted to " - f"MultiThreshold nodes." 
- ) + # fall back to QuantIdentityHandler here + # it may still not work due to its particular restrictions, + # but better than just erroring out without trying + handler = QuantIdentityHandler(model, n, node_ind) + model = handler.replace_quant_node() graph_modified = True return (model, graph_modified) diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py index 65478d2540b53443d3f74b44a22fde3defd8ca93..797dad32a2cfeb3e00e224f264d91b5ee0e9247b 100644 --- a/src/finn/util/data_packing.py +++ b/src/finn/util/data_packing.py @@ -265,7 +265,7 @@ def numpy_to_hls_code( # define a function to convert a single element into a C++ init string # a single element can be a hex string if we are using packing def elem2str(x): - if type(x) == str or type(x) == np.str_ or type(x) == np.str: + if type(x) == str or type(x) == np.str_: return '%s("%s", 16)' % (hls_dtype, x) elif type(x) == np.float32: if dtype.is_integer(): diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py index d7ed3e261fe024b7f054382f12184628d3f3e94c..a00899cf784cbe3985b942af6b3d9a4c14cd8706 100644 --- a/src/finn/util/pyverilator.py +++ b/src/finn/util/pyverilator.py @@ -28,33 +28,41 @@ import pkg_resources as pk +import numpy as np import os import shutil from pyverilator import PyVerilator +from qonnx.custom_op.registry import getCustomOp -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import ( + get_rtlsim_trace_depth, + launch_process_helper, + make_build_dir, +) -def pyverilate_stitched_ip( - model, - read_internal_signals=True, - disable_common_warnings=True, - extra_verilator_args=[], -): - """Given a model with stitched IP, return a PyVerilator sim object. - Trace depth is also controllable, see get_rtlsim_trace_depth() +def make_single_source_file(filtered_verilog_files, target_file): + """Dump all Verilog code used by stitched IP into a single file. 
+ This is because large models with many files require a verilator + command line too long for bash on most systems""" - :param read_internal_signals If set, it will be possible to examine the - internal (not only port) signals of the Verilog module, but this may - slow down compilation and emulation. + # concatenate all verilog code into a single file + with open(target_file, "w") as wf: + for vfile in filtered_verilog_files: + with open(vfile) as rf: + wf.write("//Added from " + vfile + "\n\n") + lines = rf.read() + for line in lines.split("\n"): + # break down too-long lines, Verilator complains otherwise + if len(line) > 20000: + line = line.replace("&", "\n&") + wf.write("\n" + line) - :param disable_common_warnings If set, disable the set of warnings that - Vivado-HLS-generated Verilog typically triggers in Verilator - (which can be very verbose otherwise) - """ - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") +def prepare_stitched_ip_for_verilator(model): + """Prepare sources from given stitched IP for verilator simulation, including + generating a single source file and replacing certain Vivado infrastructure + headers with Verilator-compatible ones""" vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj") with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f: @@ -67,8 +75,6 @@ def pyverilate_stitched_ip( return os.path.basename(os.path.realpath(x)) top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) - top_module_name = top_module_file_name.strip(".v") - build_dir = make_build_dir("pyverilator_ipstitched_") # dump all Verilog code to a single file # this is because large models with many files require @@ -79,7 +85,7 @@ def pyverilate_stitched_ip( # remove duplicates from list by doing list -> set -> list src_exts = [".v", ".sv"] - all_verilog_src_files = list( + all_verilog_files = list( set( filter( lambda x: any(map(lambda y: x.endswith(y), 
src_exts)), all_verilog_srcs @@ -87,7 +93,9 @@ def pyverilate_stitched_ip( ) ) - verilog_header_dir = make_build_dir("pyverilator_vh_") + verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" + os.makedirs(verilog_header_dir, exist_ok=True) + # use custom version of axis infrastructure vh # to enable Verilator to simulate AMD/Xilinx components (e.g DWC) custom_vh = pk.resource_filename( @@ -105,7 +113,7 @@ def pyverilate_stitched_ip( # remove all but one instances of regslice_core.v filtered_verilog_files = [] remove_entry = False - for vfile in all_verilog_src_files: + for vfile in all_verilog_files: if "regslice_core" in vfile: if not remove_entry: filtered_verilog_files.append(vfile) @@ -113,17 +121,159 @@ def pyverilate_stitched_ip( else: filtered_verilog_files.append(vfile) - # concatenate all verilog code into a single file - with open(vivado_stitch_proj_dir + "/" + top_module_file_name, "w") as wf: - for vfile in filtered_verilog_files: - with open(vfile) as rf: - wf.write("//Added from " + vfile + "\n\n") - lines = rf.read() - for line in lines.split("\n"): - # break down too-long lines, Verilator complains otherwise - if len(line) > 20000: - line = line.replace("&", "\n&") - wf.write("\n" + line) + target_file = vivado_stitch_proj_dir + "/" + top_module_file_name + make_single_source_file(filtered_verilog_files, target_file) + + return vivado_stitch_proj_dir + + +def verilator_fifosim(model, n_inputs, max_iters=100000000): + """Create a Verilator model of stitched IP and use a simple C++ + driver to drive the input stream. 
Useful for FIFO sizing, latency + and throughput measurement.""" + + vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model) + build_dir = make_build_dir("verilator_fifosim_") + fifosim_cpp_fname = pk.resource_filename( + "finn.qnn-data", "cpp/verilator_fifosim.cpp" + ) + with open(fifosim_cpp_fname, "r") as f: + fifosim_cpp_template = f.read() + assert len(model.graph.input) == 1, "Only a single input stream is supported" + assert len(model.graph.output) == 1, "Only a single output stream is supported" + iname = model.graph.input[0].name + first_node = model.find_consumer(iname) + oname = model.graph.output[0].name + last_node = model.find_producer(oname) + assert (first_node is not None) and ( + last_node is not None + ), "Failed to find first/last nodes" + fnode_inst = getCustomOp(first_node) + lnode_inst = getCustomOp(last_node) + ishape_folded = fnode_inst.get_folded_input_shape() + oshape_folded = lnode_inst.get_folded_output_shape() + + fifo_log = [] + fifo_log_templ = ' results_file << "maxcount%s" << "\\t" ' + fifo_log_templ += "<< to_string(top->maxcount%s) << endl;" + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + fifo_ind = 0 + for fifo_node in fifo_nodes: + fifo_node = getCustomOp(fifo_node) + if fifo_node.get_nodeattr("depth_monitor") == 1: + suffix = "" if fifo_ind == 0 else "_%d" % fifo_ind + fifo_log.append(fifo_log_templ % (suffix, suffix)) + fifo_ind += 1 + fifo_log = "\n".join(fifo_log) + + template_dict = { + "ITERS_PER_INPUT": np.prod(ishape_folded[:-1]), + "ITERS_PER_OUTPUT": np.prod(oshape_folded[:-1]), + "N_INPUTS": n_inputs, + "MAX_ITERS": max_iters, + "FIFO_DEPTH_LOGGING": fifo_log, + } + + for (key, val) in template_dict.items(): + fifosim_cpp_template = fifosim_cpp_template.replace(f"@{key}@", str(val)) + + with open(build_dir + "/verilator_fifosim.cpp", "w") as f: + f.write(fifosim_cpp_template) + + which_verilator = shutil.which("verilator") + if which_verilator is None: + raise Exception("'verilator' executable 
not found") + + verilator_args = [ + "perl", + which_verilator, + "-Wno-fatal", + "-Mdir", + build_dir, + "-y", + vivado_stitch_proj_dir, + "--CFLAGS", + "--std=c++11", + "-O3", + "--x-assign", + "fast", + "--x-initial", + "fast", + "--noassert", + "--cc", + "finn_design_wrapper.v", + "--top-module", + "finn_design_wrapper", + "--exe", + "verilator_fifosim.cpp", + "--threads", + "4", + ] + + proc_env = os.environ.copy() + gcc_args = "-O3 -march=native" + proc_env["OPT_FAST"] = gcc_args + make_args = [ + "make", + "-j4", + "-C", + build_dir, + "-f", + "Vfinn_design_wrapper.mk", + "Vfinn_design_wrapper", + ] + + with open(build_dir + "/compile.sh", "w") as f: + f.write("#!/bin/bash" + "\n") + f.write("export OPT_FAST='%s'\n" % gcc_args) + f.write(" ".join(verilator_args) + "\n") + f.write(" ".join(make_args) + "\n") + + launch_process_helper(verilator_args, cwd=build_dir) + launch_process_helper(make_args, proc_env=proc_env, cwd=build_dir) + + sim_launch_args = ["./Vfinn_design_wrapper"] + launch_process_helper(sim_launch_args, cwd=build_dir) + + with open(build_dir + "/results.txt", "r") as f: + results = f.read().strip().split("\n") + ret_dict = {} + for result_line in results: + key, val = result_line.split("\t") + ret_dict[key] = int(val) + return ret_dict + + +def pyverilate_stitched_ip( + model, + read_internal_signals=True, + disable_common_warnings=True, + extra_verilator_args=[], +): + """Given a model with stitched IP, return a PyVerilator sim object. + Trace depth is also controllable, see get_rtlsim_trace_depth() + + :param read_internal_signals If set, it will be possible to examine the + internal (not only port) signals of the Verilog module, but this may + slow down compilation and emulation. 
+ + :param disable_common_warnings If set, disable the set of warnings that + Vivado-HLS-generated Verilog typically triggers in Verilator + (which can be very verbose otherwise) + + """ + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model) + verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" + + def file_to_basename(x): + return os.path.basename(os.path.realpath(x)) + + top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) + top_module_name = top_module_file_name.strip(".v") + build_dir = make_build_dir("pyverilator_ipstitched_") verilator_args = [] # disable common verilator warnings that should be harmless but commonly occur diff --git a/src/finn/util/test.py b/src/finn/util/test.py index bfe4aa0bb826c73f6a7c67f025e24764da8c36cc..bd8bde2820fa87ed972d699cae905d7f6cc310ff 100644 --- a/src/finn/util/test.py +++ b/src/finn/util/test.py @@ -91,8 +91,8 @@ def soft_verify_topk(invec, idxvec, k): """Check that the topK indices provided actually point to the topK largest values in the input vector""" np_topk = np.flip(invec.flatten().argsort())[:k] - soft_expected = invec.flatten()[np_topk.astype(np.int).flatten()] - soft_produced = invec.flatten()[idxvec.astype(np.int).flatten()] + soft_expected = invec.flatten()[np_topk.astype(np.int_).flatten()] + soft_produced = invec.flatten()[idxvec.astype(np.int_).flatten()] return (soft_expected == soft_produced).all() diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py index aaeb3ab920d1d8fae79c1173582d18cf81d03063..1f77276d5a72e5f886d5f94af8d35121ccadd486 100644 --- a/src/finn/util/vcd.py +++ b/src/finn/util/vcd.py @@ -101,19 +101,21 @@ def get_stream_if_stats(vcd_file, if_base_name): <stream_state>: (<num_samples>, <fraction_of_time>), where <stream_state> is the combination of (V)alid/(R)eady values, - <num_samples> is the approximate number of rising clock edges spent in 
<state> - , and <fraction_of_time> is the fraction of <num_samples> to total + <num_samples> is the approximate number of rising clock edges spent in <state>, + and <fraction_of_time> is the fraction of <num_samples> to total amount of time recorded by the trace. Example: - {"{'V': 0, 'R': 0}": (5, 0.0006060606060606061), - "{'V': 1, 'R': 0}": (0, 0.0), - "{'V': 0, 'R': 1}": (7605, 0.9218181818181819), - "{'V': 1, 'R': 1}": (640, 0.07757575757575758)} - + { + "{'V': 0, 'R': 0}": (5, 0.0006060606060606061), + "{'V': 1, 'R': 0}": (0, 0.0), + "{'V': 0, 'R': 1}": (7605, 0.9218181818181819), + "{'V': 1, 'R': 1}": (640, 0.07757575757575758) + } Here we can see the stream was transmitting values 7.7% of the time, and 9.2% of the time there was no incoming data (valid 0, ready 1) """ + if_valid = if_base_name + vname if_ready = if_base_name + rname v = VCDVCD(vcd_file, signals=[if_valid], store_tvs=True) diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index 5fd1439bd055782692bac404622137e166ef5e07..116df98d17985951892cf7f43baf486ab4fd80c8 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -49,14 +49,19 @@ def fetch_test_model(topology, wbits=2, abits=2): @pytest.mark.slow @pytest.mark.vivado -@pytest.mark.fpgadataflow -def test_fifosizing_linear(): +@pytest.mark.parametrize( + "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"] +) +def test_fifosizing_linear(method): + force_python_rtlsim = "python" in method + method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize" tmp_output_dir = fetch_test_model("tfc") cfg = build_cfg.DataflowBuildConfig( output_dir=tmp_output_dir, auto_fifo_depths=True, - auto_fifo_strategy="characterize", + auto_fifo_strategy=method_key, target_fps=10000, + force_python_rtlsim=force_python_rtlsim, synth_clk_period_ns=10.0, board="Pynq-Z1", rtlsim_batch_size=100,