diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index f61af878ffadb6b4fd7a527110b105b19cee0ded..5f03379bbc37ab913f712571c630035dbad84cce 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,7 +16,7 @@ jobs:
         uses: actions/checkout@v3
 
       - name: Setup Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v4
         with:
           python-version: '3.8'
 
diff --git a/fetch-repos.sh b/fetch-repos.sh
index ee1923e3a442f92d4d95b61f2a6e7410a0517d81..16960c71e31671b042dcfb4c31208aaaf8e29906 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -36,6 +36,7 @@ HLSLIB_COMMIT="d27f6b6c5d8f1bb208db395659389603f63ad4be"
 OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
+KV260_BDF_COMMIT="98e0d3efc901f0b974006bc4370c2a7ad8856c79"
 EXP_BOARD_FILES_MD5="30eecc497c31050bd46d10ea20eba232"
 
 QONNX_URL="https://github.com/fastmachinelearning/qonnx.git"
@@ -47,6 +48,7 @@ HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git"
 OMX_URL="https://github.com/maltanar/oh-my-xilinx.git"
 AVNET_BDF_URL="https://github.com/Avnet/bdf.git"
 XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git"
+KV260_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git"
 
 QONNX_DIR="qonnx"
 FINN_EXP_DIR="finn-experimental"
@@ -57,6 +59,7 @@ HLSLIB_DIR="finn-hlslib"
 OMX_DIR="oh-my-xilinx"
 AVNET_BDF_DIR="avnet-bdf"
 XIL_BDF_DIR="xil-bdf"
+KV260_SOM_BDF_DIR="kv260-som-bdf"
 
 # absolute path to this script, e.g. /home/user/bin/foo.sh
 SCRIPT=$(readlink -f "$0")
@@ -104,6 +107,7 @@ fetch_board_files() {
     unzip -q pynq-z2.zip
     cp -r $SCRIPTPATH/deps/$AVNET_BDF_DIR/* $SCRIPTPATH/deps/board_files/
     cp -r $SCRIPTPATH/deps/$XIL_BDF_DIR/boards/Xilinx/rfsoc2x2 $SCRIPTPATH/deps/board_files/;
+    cp -r $SCRIPTPATH/deps/$KV260_SOM_BDF_DIR/boards/Xilinx/kv260_som $SCRIPTPATH/deps/board_files/;
     cd $OLD_PWD
 }
 
@@ -116,6 +120,7 @@ fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR
 fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR
 fetch_repo $AVNET_BDF_URL $AVNET_BDF_COMMIT $AVNET_BDF_DIR
 fetch_repo $XIL_BDF_URL $XIL_BDF_COMMIT $XIL_BDF_DIR
+fetch_repo $KV260_BDF_URL $KV260_BDF_COMMIT $KV260_SOM_BDF_DIR
 
 # download extra Pynq board files and extract if needed
 if [ ! -d "$SCRIPTPATH/deps/board_files" ]; then
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 80934d812fd7a165fdaba9c3fb17dc37e5a82d49..a38cb6e572d683871a924330742a1859b6fbe75d 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -253,6 +253,10 @@ class DataflowBuildConfig:
     #: for each FIFO.
     auto_fifo_depths: Optional[bool] = True
 
+    #: Whether FIFO nodes with depth larger than 32768 will be split.
+    #: Allow to configure very large FIFOs in the folding_config_file.
+    split_large_fifos: Optional[bool] = False
+
     #: When `auto_fifo_depths = True`, select which method will be used for
     #: setting the FIFO sizes.
     auto_fifo_strategy: Optional[
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index ce0ffb4771bad4a9818842f3f744c7a2b4207b8d..b0f7b6ec6cada69d402af9089c66636248150b19 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -99,6 +99,7 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.set_fifo_depths import (
     InsertAndSetFIFODepths,
     RemoveShallowFIFOs,
+    SplitLargeFIFOs,
 )
 from finn.transformation.fpgadataflow.set_folding import SetFolding
 from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
@@ -566,8 +567,6 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(GiveReadableTensorNames())
         if cfg.folding_config_file is not None:
             model = model.transform(ApplyConfig(cfg.folding_config_file))
-        # remove any shallow FIFOs
-        model = model.transform(RemoveShallowFIFOs())
 
     # extract the final configuration and save it as json
     hw_attrs = [
@@ -586,6 +585,13 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         model, cfg.output_dir + "/final_hw_config.json", hw_attrs
     )
 
+    # perform FIFO splitting and shallow FIFO removal only after the final config
+    # json file has been written. otherwise, since these transforms may add/remove
+    # FIFOs, we get name mismatch problems when trying to reuse the final config.
+    if cfg.split_large_fifos:
+        model = model.transform(SplitLargeFIFOs())
+    model = model.transform(RemoveShallowFIFOs())
+
     # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
     # this will only run for the new nodes (e.g. FIFOs and DWCs)
     model = model.transform(
diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py
index d6284750c73026c09fb7986ffc2517ed9ae3b153..68ed6546c741277bd8e962b6e80eda083cedba9c 100644
--- a/src/finn/custom_op/fpgadataflow/eltwise.py
+++ b/src/finn/custom_op/fpgadataflow/eltwise.py
@@ -398,7 +398,7 @@ class StreamingEltwise(HLSCustomOp):
                 "StreamingEltwise",
                 self.get_nodeattr("NumChannels"),
                 self.get_nodeattr("PE"),
-                self.get_number_output_values(),
+                int(np.prod(self.get_folded_output_shape()[:-2])),
                 slice_in0,
                 slice_in1,
                 slice_out,
diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 9f34eb15153219255ff2659fb01c8fcdb24c1b44..72128fda4cfe23db4858fe3ffe80a755733954cc 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -576,6 +576,10 @@ class MatrixVectorActivation(HLSCustomOp):
 
     def minimize_accumulator_width(self, model):
         weights = model.get_initializer(self.onnx_node.input[1])
+        # since in the calculation the values of the weight matrix are used,
+        # for the bipolar case they need to be converted to bipolar
+        if self.get_nodeattr("binaryXnorMode"):
+            weights = 2 * weights - 1
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
         else:
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 1bb37c0fde0bfb17d47d6dd69721639281c5e66b..d9745acf63c4685b3369ac379abde0a6c5a3f157 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -602,13 +602,17 @@ class Thresholding_Batch(HLSCustomOp):
 
     # TODO check and add whatever missing
     def defines(self, var):
+        numReps = 1
         numInputVectors = list(self.get_nodeattr("numInputVectors"))
-        numReps = int(np.prod(numInputVectors))
+        total_spatial_size = int(np.prod(numInputVectors))
+
         self.code_gen_dict["$DEFINES$"] = [
-            """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}""".format(
+            """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}\n
+               #define ImgDim1 {}""".format(
                 self.get_nodeattr("NumChannels"),
                 self.get_nodeattr("PE"),
                 numReps,
+                total_spatial_size,
             )
         ]
         if self.get_nodeattr("mem_mode") == "decoupled":
@@ -649,7 +653,7 @@ class Thresholding_Batch(HLSCustomOp):
             npy_in = "%s/thresholds.npy" % code_gen_dir
 
             self.code_gen_dict["$READNPYDATA$"].append(
-                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);'
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, ImgDim1);'
                 % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
             )
 
@@ -671,18 +675,13 @@ class Thresholding_Batch(HLSCustomOp):
 
     def docompute(self):
         tmpl_args = self.get_template_param_values()
-        # TODO: why put some template parameters into defines and not others?
-        # should ImgDim be defined or just filled in here like we do now?
         node = self.onnx_node
-        inp_vecs = self.get_nodeattr("numInputVectors")
-        total_spatial_size = int(np.prod(inp_vecs))
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "const":
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<{}, NumChannels1, PE1, {}, {}>
+                """{}<ImgDim1, NumChannels1, PE1, {}, {}>
                 (in0, out, threshs, numReps);""".format(
                     node.op_type,
-                    total_spatial_size,
                     tmpl_args["TSrcI"],
                     tmpl_args["TDstI"],
                 )
@@ -692,10 +691,9 @@ class Thresholding_Batch(HLSCustomOp):
             # - for cppsim the repetition comes from the threshold stream reader+input
             # - for synth the unit runs continuously anyway (ap_ctrl_none)
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
-                (in0, out, weights, 1);""".format(
+                """{}<ImgDim1, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
+                (in0, out, weights, numReps);""".format(
                     "Thresholding_Stream_Batch",
-                    total_spatial_size,
                     tmpl_args["TSrcI"],
                     tmpl_args["TDstI"],
                 )
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 9282d399f857fa9da6f685421ce46f1d9cc5e851..2619557edfb92059f0ac0d824f7e9c289b282612 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -29,10 +29,16 @@
 import math
 import numpy as np
 import warnings
+from onnx import TensorProto, helper
 from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk
+from qonnx.core.datatype import DataType
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
-from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from qonnx.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    SortGraph,
+)
 
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
@@ -484,3 +490,126 @@ class InsertAndSetFIFODepths(Transformation):
                 node_inst.set_nodeattr("outFIFODepths", fifodepth_out)
 
         return (model, False)
+
+
+def get_fifo_split_configs(depth, max_qsrl_depth=256, max_vivado_depth=32768):
+    """Break non-power-of-2 sized FIFO depths into several ones"""
+
+    def floor_pow2(x):
+        if (x & (x - 1) == 0) and x != 0:
+            return x
+        else:
+            return 1 << ((x - 1).bit_length() - 1)
+
+    def decompose_pow2(x):
+        if x <= max_qsrl_depth:
+            return [x]
+        else:
+            r = floor_pow2(x)
+            if x == r:
+                return [x]
+            else:
+                return [r, *decompose_pow2(x - r)]
+
+    ret = []
+    # trivial case: for small FIFOs, return as-is with rtl style
+    if depth <= max_qsrl_depth:
+        return [(depth, "rtl")]
+    # first pass: ensure max depth is respected
+    # (restricted by Vivado AXIS infra IP)
+    remainder = depth
+    while remainder != 0:
+        if remainder > max_vivado_depth:
+            ret.append(max_vivado_depth)
+            remainder -= max_vivado_depth
+        else:
+            ret.append(remainder)
+            remainder = 0
+    # second pass: break non-power-of-2 sized FIFOs
+    # into several ones
+
+    ret_pass2 = list(map(decompose_pow2, ret))
+    # unpack list of lists
+    ret_pass2 = [x for dec_list in ret_pass2 for x in dec_list]
+
+    # finally, add impl_style to each split FIFO
+    ret_final = []
+    for cand_depth in ret_pass2:
+        if cand_depth <= max_qsrl_depth:
+            ret_final.append((cand_depth, "rtl"))
+        else:
+            ret_final.append((cand_depth, "vivado"))
+
+    return ret_final
+
+
+class SplitLargeFIFOs(Transformation):
+    """Split large FIFOs before implementation, for two reasons:
+
+    - impl_style="vivado" supports a max depth of 32k. Any larger
+      FIFOs must be implemented as a sequence of smaller FIFOs.
+    - impl_style="vivado" requires power-of-two depths, which is
+      normally handled by rounding up to the nearest power-of-two.
+      So a FIFO of size 8196 normally gets rounded-up to a depth of
+      16384 and wastes a lot of resources. Here, instead, we split
+      this up into two FIFOs of depth 8192 + 4.
+
+    """
+
+    def __init__(self, max_qsrl_depth=256, max_vivado_depth=32768):
+        super().__init__()
+        self.max_qsrl_depth = max_qsrl_depth
+        self.max_vivado_depth = max_vivado_depth
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "StreamingFIFO":
+                n_inst = getCustomOp(node)
+                depth = n_inst.get_nodeattr("depth")
+                cfgs = get_fifo_split_configs(
+                    depth, self.max_qsrl_depth, self.max_vivado_depth
+                )
+                if len(cfgs) > 1:
+                    fld_shape = n_inst.get_folded_output_shape()
+                    dtype = n_inst.get_nodeattr("dataType")
+                    ram_style = n_inst.get_nodeattr("ram_style")
+                    shape = model.get_tensor_shape(node.input[0])
+                    for i, (fifo_depth, impl_style) in enumerate(cfgs):
+                        if i == 0:
+                            inp = node.input[0]
+                        else:
+                            inp = node.name + "_" + str(i - 1) + "_out"
+                        if i == len(cfgs) - 1:
+                            outp = node.output[0]
+                        else:
+                            outp = node.name + "_" + str(i) + "_out"
+                            out_tensor = helper.make_tensor_value_info(
+                                outp, TensorProto.FLOAT, shape
+                            )
+                            graph.value_info.append(out_tensor)
+                            model.set_tensor_datatype(out_tensor.name, DataType[dtype])
+                        fifo_node = helper.make_node(
+                            "StreamingFIFO",
+                            [inp],
+                            [outp],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            depth=fifo_depth,
+                            folded_shape=fld_shape,
+                            dataType=dtype,
+                            impl_style=impl_style,
+                            ram_style=ram_style,
+                            name=node.name + "_" + str(i),
+                        )
+                        graph.node.insert(node_ind + i, fifo_node)
+
+                    graph.node.remove(node)
+                    graph_modified = True
+        if graph_modified:
+            model = model.transform(SortGraph())
+            model = model.transform(GiveReadableTensorNames())
+        return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 78bcdea0d701f97e9f80d7c7c489aa01bc93fa52..f52bad0ffb35ae4714acc24aef368d01967db426 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -126,6 +126,9 @@ if {$BOARD == "ZCU104"} {
 } elseif {$BOARD == "Pynq-Z1"} {
     set ZYNQ_TYPE "zynq_7000"
     set_property board_part www.digilentinc.com:pynq-z1:part0:1.0 [current_project]
+} elseif {$BOARD == "KV260_SOM"} {
+    set ZYNQ_TYPE "zynq_us+"
+    set_property board_part xilinx.com:kv260_som:part0:1.3 [current_project]
 } else {
     puts "Unrecognized board"
 }
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 4aba87216c8999612f748e989a945ceff33da167..3bc5b803db2072f4d0ed3829adab93b4fbd3b98e 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -40,6 +40,8 @@ pynq_part_map["ZCU102"] = "xczu9eg-ffvb1156-2-e"
 pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e"
 pynq_part_map["ZCU111"] = "xczu28dr-ffvg1517-2-e"
 pynq_part_map["RFSoC2x2"] = "xczu28dr-ffvg1517-2-e"
+pynq_part_map["KV260_SOM"] = "xck26-sfvc784-2LV-c"
+
 
 # native AXI HP port width (in bits) for PYNQ boards
 pynq_native_port_width = dict()
@@ -50,6 +52,7 @@ pynq_native_port_width["ZCU102"] = 128
 pynq_native_port_width["ZCU104"] = 128
 pynq_native_port_width["ZCU111"] = 128
 pynq_native_port_width["RFSoC2x2"] = 128
+pynq_native_port_width["KV260_SOM"] = 128
 
 # Alveo device and platform mappings
 alveo_part_map = dict()
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
index a00899cf784cbe3985b942af6b3d9a4c14cd8706..8d188585694c172d97d73fa6b5820edb7b48a948 100644
--- a/src/finn/util/pyverilator.py
+++ b/src/finn/util/pyverilator.py
@@ -133,6 +133,7 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000):
     and throughput measurement."""
 
     vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model)
+    verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh"
     build_dir = make_build_dir("verilator_fifosim_")
     fifosim_cpp_fname = pk.resource_filename(
         "finn.qnn-data", "cpp/verilator_fifosim.cpp"
@@ -184,6 +185,19 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000):
     if which_verilator is None:
         raise Exception("'verilator' executable not found")
 
+    # add defines to make certain XPM src files work with Verilator
+    xpm_args = []
+    xpm_args.append("-DDISABLE_XPM_ASSERTIONS")
+    xpm_args.append("-DOBSOLETE")
+    xpm_args.append("-DONESPIN")
+    xpm_args.append("--bbox-unsup")
+    vivado_path = os.environ["VIVADO_PATH"]
+    # additional SystemVerilog modules to make XPMs work with Verilator
+    xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv"
+    xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv"
+    xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv"
+    verilog_file_arg = ["finn_design_wrapper.v", xpm_memory, xpm_cdc, xpm_fifo]
+
     verilator_args = [
         "perl",
         which_verilator,
@@ -192,6 +206,8 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000):
         build_dir,
         "-y",
         vivado_stitch_proj_dir,
+        "-y",
+        verilog_header_dir,
         "--CFLAGS",
         "--std=c++11",
         "-O3",
@@ -201,13 +217,14 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000):
         "fast",
         "--noassert",
         "--cc",
-        "finn_design_wrapper.v",
+        *verilog_file_arg,
         "--top-module",
         "finn_design_wrapper",
         "--exe",
         "verilator_fifosim.cpp",
         "--threads",
         "4",
+        *xpm_args,
     ]
 
     proc_env = os.environ.copy()
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
index b1655485a0eea84f7afc2273e42eb28376db6381..f4f2b8dbfff0d720ec4eb901704581b096c0ea40 100644
--- a/tests/fpgadataflow/test_fifosizing.py
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -51,6 +51,7 @@ def fetch_test_model(topology, wbits=2, abits=2):
 
 @pytest.mark.slow
 @pytest.mark.vivado
+@pytest.mark.fpgadataflow
 @pytest.mark.parametrize(
     "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"]
 )
diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py
new file mode 100644
index 0000000000000000000000000000000000000000..85b4a2bfa8dc0de3cbdd0ca34ec5b1ee68f37acf
--- /dev/null
+++ b/tests/fpgadataflow/test_split_large_fifos.py
@@ -0,0 +1,128 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import pytest
+
+import json
+import shutil
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+from finn.transformation.fpgadataflow.set_fifo_depths import get_fifo_split_configs
+from finn.util.basic import make_build_dir
+from finn.util.test import get_trained_network_and_ishape
+
+
+def fetch_test_model(topology, wbits=2, abits=2):
+    tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology)
+    (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
+    chkpt_name = tmp_output_dir + "/model.onnx"
+    BrevitasONNXManager.export(model, ishape, chkpt_name)
+    return tmp_output_dir
+
+
+def get_folding_cfg(depth=65536):
+    cfg = dict()
+    cfg["Defaults"] = dict()
+    for i in range(3):
+        key = "StreamingFIFO_" + str(i)
+        cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"}
+    return cfg
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+@pytest.mark.parametrize("depth", [16384, 65536, 45000])
+@pytest.mark.parametrize("force_python_rtlsim", ["True", "False"])
+def test_split_large_fifos(depth, force_python_rtlsim):
+    tmp_output_dir = fetch_test_model("tfc")
+    folding_cfg = get_folding_cfg(depth)
+    with open(tmp_output_dir + "/folding_config.json", "w") as f:
+        json.dump(folding_cfg, f, indent=2)
+    cfg = build_cfg.DataflowBuildConfig(
+        output_dir=tmp_output_dir,
+        auto_fifo_depths=False,
+        split_large_fifos=True,
+        folding_config_file=tmp_output_dir + "/folding_config.json",
+        target_fps=10000,
+        force_python_rtlsim=force_python_rtlsim,
+        synth_clk_period_ns=10.0,
+        board="Pynq-Z1",
+        rtlsim_batch_size=100,
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.STITCHED_IP,
+            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        ],
+        default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED,
+    )
+    build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg)
+    with open(tmp_output_dir + "/report/estimate_network_performance.json") as f:
+        est_data = json.load(f)
+    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
+        sim_data = json.load(f)
+    assert (
+        float(sim_data["throughput[images/s]"])
+        / float(est_data["estimated_throughput_fps"])
+        > 0.9
+    )
+    model = ModelWrapper(
+        tmp_output_dir + "/intermediate_models/step_set_fifo_depths.onnx"
+    )
+    # exclude final FIFO node (output FIFO, not part of test)
+    fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")[:-1]
+    golden_cfg = get_fifo_split_configs(depth, 256, 32768)
+    for i, fifo_node in enumerate(fifo_nodes):
+        inst = getCustomOp(fifo_node)
+        fifo_depth = inst.get_nodeattr("depth")
+        assert fifo_depth == golden_cfg[i % len(golden_cfg)][0]
+
+    shutil.rmtree(tmp_output_dir)
+
+
+def test_split_large_fifo_configs():
+    ret0 = get_fifo_split_configs(513, 256, 32768)
+    assert ret0 == [(512, "vivado"), (1, "rtl")]
+    ret1 = get_fifo_split_configs(1200, 256, 32768)
+    assert ret1 == [(1024, "vivado"), (176, "rtl")]
+    ret2 = get_fifo_split_configs(45000, 256, 32768)
+    assert ret2 == [
+        (32768, "vivado"),
+        (8192, "vivado"),
+        (2048, "vivado"),
+        (1024, "vivado"),
+        (512, "vivado"),
+        (256, "rtl"),
+        (200, "rtl"),
+    ]