diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..7591f09d8d0cd1847672fe5aa09616ff1571033d
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -0,0 +1,331 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+
+import numpy as np
+
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from onnx import TensorProto, helper
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class LabelSelect_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib LabelSelect_Batch function."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "Labels": ("i", True, 0),
+            "PE": ("i", True, 0),
+            "K": ("i", True, 0),
+            # FINN DataTypes for input
+            "inputDataType": ("s", True, ""),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self):
+        nlabels = self.get_nodeattr("Labels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [nlabels])
+        return ishape
+
+    def get_folded_input_shape(self):
+        nlabels = self.get_nodeattr("Labels")
+        pe = self.get_nodeattr("PE")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        assert nlabels % pe == 0, "PE must divide Labels"
+        assert pe == 1, "LabelSelect currently fails with folding"
+        folds = int(nlabels / pe)
+        folded_ishape = tuple(vecs + [folds, pe])
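+        # e.g. Labels=10, PE=2, numInputVectors=[1] gives folds=5 and a folded
+        # input shape of (1, 5, 2), i.e. five stream words of two labels each;
+        # with the PE=1 restriction above this is (1, 10, 1)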
+        return folded_ishape
+
+    def get_normal_output_shape(self):
+        k = self.get_nodeattr("K")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        oshape = tuple(vecs + [k])
+        return oshape
+
+    def get_folded_output_shape(self):
+        k = self.get_nodeattr("K")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        oshape = tuple(vecs + [k, 1])
+        return oshape
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.int64)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.INT64,
+                dims=values.shape,
+                vals=values.flatten(),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        # currently set to uint32 to be compatible with hlslib
+        # enhancement: consider finding smallest power-of-two int for reduced output bandwidth
+        model.set_tensor_datatype(self.onnx_node.output[0], DataType.UINT32)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("Labels")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("K")
+            self.get_nodeattr("inputDataType")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required LabelSelect_Batch attributes do not exist."""
+            )
+
+        # verify that input data is 1D
+        if len(self.get_nodeattr("numInputVectors")) > 1:
+            info_messages.append("""LabelSelect_Batch requires 1D data input.""")
+            raise Exception("LabelSelect_Batch requires 1D data input.")
+
+        return info_messages
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        assert ret.signed() is False, "LabelSelect is currently broken for signed inputs"
+        return ret
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        return DataType.UINT32
+
+    def get_instream_width(self):
+        """Returns input stream width."""
+        ibits = self.get_input_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        in_width = pe * ibits
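+        # e.g. PE=1 with a UINT8 input gives an 8-bit wide input stream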
+        return in_width
+
+    def get_outstream_width(self):
+        """Returns output stream width."""
+        return self.get_output_datatype().bitwidth()
+
+    def get_number_output_values(self):
+        return self.get_nodeattr("K")
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert inp.shape == exp_ishape, """Input shape doesn't match expected shape."""
+        export_idt = self.get_input_datatype()
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        node = self.onnx_node
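+        # emits e.g. LabelSelect_Batch<10, 1, 5, ap_uint<8>, ap_uint<32>> (in0, out, 1);
+        # for Labels=10, PE=1, K=5 and a UINT8 input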
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<{}, {}, {}, {}, ap_uint<32>> (in0, out, 1);""".format(
+                node.op_type,
+                self.get_nodeattr("Labels"),
+                self.get_nodeattr("PE"),
+                self.get_nodeattr("K"),
+                self.get_input_datatype().get_hls_datatype_str(),
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
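+        # e.g. a folded output shape of (1, 5, 1) becomes the C++ initializer {1, 5, 1}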
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
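+        # emits e.g.
+        #   void LabelSelect_Batch_0(hls::stream<ap_uint<1*8>> &in0,
+        #       hls::stream<ap_uint<32>> &out)
+        # for a node named LabelSelect_Batch_0 with PE=1 and a UINT8 input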
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}*{}>> &in0,
+                hls::stream<ap_uint<32>> &out)""".format(
+                self.onnx_node.name,
+                self.get_nodeattr("PE"),
+                self.get_input_datatype().bitwidth(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index e72cc5b1a23f04bae978c6f32ba7e45760824349..514b0bf62285faaf32c7baa063bbab3622e280f0 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -44,6 +44,7 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
+from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
 
 # create a mapping of all known CustomOp names and classes
 custom_op = {}
@@ -60,6 +61,7 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
+custom_op["LabelSelect_Batch"] = LabelSelect_Batch
 
 
 def getCustomOp(node):
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index f29af66f52e18f8aeed87e19bfb52354ca1b73a7..b0985bddd3306f81ad0e81f0fb582a0ae7fdaf3d 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from brevitas_examples import bnn_pynq
+import numpy as np
 import pytest
 import warnings
 from finn.core.modelwrapper import ModelWrapper
@@ -66,6 +67,15 @@ def get_test_model_untrained(netname, wbits, abits):
     return get_test_model(netname, wbits, abits, pretrained=False)
 
 
+def soft_verify_topk(invec, idxvec, k):
+    """Check that the topK indices provided actually point to the topK largest
+    values in the input vector"""
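+    # e.g. invec=[[0.2, 0.9, 0.5]], idxvec=[1, 2], k=2 passes: indices 1 and 2
+    # hold the two largest values; the values at the produced indices must match
+    # the reference top-k values in order, so a tie resolved to a different
+    # index still passes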
+    np_topk = np.flip(invec.flatten().argsort())[:k]
+    soft_expected = invec.flatten()[np_topk.astype(int).flatten()]
+    soft_produced = invec.flatten()[idxvec.astype(int).flatten()]
+    return (soft_expected == soft_produced).all()
+
+
 def load_test_checkpoint_or_skip(filename):
     "Try to load given .onnx and return ModelWrapper, else skip current test."
     try:
diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
new file mode 100644
index 0000000000000000000000000000000000000000..2df841728395229dafe33d2804c44a3489ef3e45
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.util.basic import gen_finn_dt_tensor
+from finn.util.test import soft_verify_topk
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+
+
+def make_labelselect_modelwrapper(labels, pe, k, idt):
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, labels])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, k])
+
+    labelselect_node = helper.make_node(
+        "LabelSelect_Batch",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        Labels=labels,
+        PE=pe,
+        K=k,
+        inputDataType=idt.name,
+    )
+    graph = helper.make_graph(
+        nodes=[labelselect_node], name="graph", inputs=[inp], outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="labelselect-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", DataType.UINT32)
+
+    return model
+
+
+def prepare_inputs(input_tensor, idt):
+    return {"inp": input_tensor}
+
+
+# TODO: folded inputs fail, likely problem in hlslib
+# input datatype -- only unsigned types; signed inputs are rejected by an
+# assertion in LabelSelect_Batch.get_input_datatype
+@pytest.mark.parametrize("idt", [DataType.UINT8, DataType.UINT16])
+# labels
+@pytest.mark.parametrize("labels", [10, 1000])
+# folding
+@pytest.mark.parametrize("fold", [-1])
+# number of top labels to select
+@pytest.mark.parametrize("k", [1, 5])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode):
+    if fold == -1:
+        pe = 1
+    else:
+        pe = labels // fold
+    assert labels % pe == 0
+
+    if k == -1:
+        k = labels
+
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, labels))
+
+    model = make_labelselect_modelwrapper(labels, pe, k, idt)
+
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(ReplaceVerilogRelPaths())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # prepare input data and execute
+    input_dict = prepare_inputs(x, idt)
+    y = oxe.execute_onnx(model, input_dict)["outp"]
+
+    assert soft_verify_topk(x, y, k), exec_mode + " failed"