diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index 7591f09d8d0cd1847672fe5aa09616ff1571033d..f61fbf12da889700297006ef2566088d4150c0e4 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -41,6 +41,13 @@ class LabelSelect_Batch(HLSCustomOp):
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
+        odt_name = self.get_nodeattr("outputDataType")
+        if odt_name == "":
+            # If not provided, compute the smallest datatype that can hold the max label index
+            labels = self.get_nodeattr("Labels")
+            odt = DataType.get_smallest_possible(labels - 1)
+            odt_name = odt.name
+            self.set_nodeattr("outputDataType", odt_name)
 
     def get_nodeattr_types(self):
         my_attrs = {
@@ -49,6 +56,7 @@ class LabelSelect_Batch(HLSCustomOp):
             "K": ("i", True, 0),
             # FINN DataTypes for input
             "inputDataType": ("s", True, ""),
+            "outputDataType": ("s", False, ""),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -69,7 +77,6 @@ class LabelSelect_Batch(HLSCustomOp):
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
         assert nlabels % pe == 0, "PE must divide Labels"
-        assert pe == 1, "LabelSelect currently fails with folding"
         folds = int(nlabels / pe)
         folded_ishape = tuple(vecs + [folds, pe])
         return folded_ishape
@@ -90,7 +97,7 @@ class LabelSelect_Batch(HLSCustomOp):
         exp_ishape = self.get_normal_input_shape()
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
-        assert ishape == exp_ishape, "Unexpect input shape."
+        assert ishape == exp_ishape, "Unexpected input shape."
         # implement tensor with correct shape
         values = np.random.randn(*oshape).astype(np.int64)
         return helper.make_node(
@@ -106,9 +113,8 @@ class LabelSelect_Batch(HLSCustomOp):
         )
 
     def infer_node_datatype(self, model):
-        # currently set to uint32 to be compatible with hlslib
-        # enhancement: consider finding smallest power-of-two int for reduced output bandwidth
-        model.set_tensor_datatype(self.onnx_node.output[0], DataType.UINT32)
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(self.onnx_node.output[0], odt)
 
     def verify_node(self):
         info_messages = []
@@ -134,6 +140,7 @@ class LabelSelect_Batch(HLSCustomOp):
             self.get_nodeattr("PE")
             self.get_nodeattr("K")
             self.get_nodeattr("inputDataType")
+            self.get_nodeattr("outputDataType")
             info_messages.append("All necessary attributes exist")
         except Exception:
             info_messages.append(
@@ -150,12 +157,12 @@ class LabelSelect_Batch(HLSCustomOp):
     def get_input_datatype(self):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
-        assert ret.signed() is False, "LabelSelect is currently broken for signed inputs"
         return ret
 
     def get_output_datatype(self):
         """Returns FINN DataType of output."""
-        return DataType.UINT32
+        ret = DataType[self.get_nodeattr("outputDataType")]
+        return ret
 
     def get_instream_width(self):
         """Returns input stream width."""
@@ -260,8 +267,13 @@ class LabelSelect_Batch(HLSCustomOp):
         npy_type = "float"
         npy_in = "%s/input_0.npy" % code_gen_dir
         self.code_gen_dict["$READNPYDATA$"] = []
+
+        # Call npy2apintstream with reverse_inner = false to get little-endian
+        # (LE) packing, as required by the HLS function LabelSelect_Batch.
+        # Note that StreamingDataWidthConverter_Batch also performs LE packing.
+
         self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);'
             % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
         )
 
@@ -277,12 +289,13 @@ class LabelSelect_Batch(HLSCustomOp):
     def docompute(self):
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<{}, {}, {}, {}, ap_uint<32>> (in0, out, 1);""".format(
+            """{}<{}, {}, {}, {}, {} > (in0, out, 1);""".format(
                 node.op_type,
                 self.get_nodeattr("Labels"),
                 self.get_nodeattr("PE"),
                 self.get_nodeattr("K"),
                 self.get_input_datatype().get_hls_datatype_str(),
+                self.get_output_datatype().get_hls_datatype_str(),
             )
         ]
 
@@ -316,10 +329,11 @@ class LabelSelect_Batch(HLSCustomOp):
     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             """void {}(hls::stream<ap_uint<{}*{}>> &in0,
-                hls::stream<ap_uint<32>> &out)""".format(
+                hls::stream<ap_uint<{}> > &out)""".format(
                 self.onnx_node.name,
                 self.get_nodeattr("PE"),
                 self.get_input_datatype().bitwidth(),
+                self.get_output_datatype().bitwidth(),
             )
         ]
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
index 2df841728395229dafe33d2804c44a3489ef3e45..9bc77cd47fd6115823f9a35d98e8874ee3f98b2d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py
+++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+import numpy as np
 
 from onnx import TensorProto, helper
 
@@ -70,7 +71,8 @@ def make_labelselect_modelwrapper(labels, pe, k, idt):
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
-    model.set_tensor_datatype("outp", DataType.UINT32)
+    odt = DataType.get_smallest_possible(labels - 1)
+    model.set_tensor_datatype("outp", odt)
 
     return model
 
@@ -79,19 +81,18 @@ def prepare_inputs(input_tensor, idt):
     return {"inp": input_tensor}
 
 
-# TODO: folded inputs fail, likely problem in hlslib
-# input datatype -- checked by assertion in HLSCustomOp
-@pytest.mark.parametrize("idt", [DataType.UINT8, DataType.UINT16])
+@pytest.mark.parametrize("idt", [DataType.UINT8, DataType.UINT16, DataType.INT16])
 # labels
-@pytest.mark.parametrize("labels", [10, 1000])
+@pytest.mark.parametrize("labels", [10, 100])
 # folding
-@pytest.mark.parametrize("fold", [-1])
+@pytest.mark.parametrize("fold", [-1, 2, 10])
 # number of top labels to select
 @pytest.mark.parametrize("k", [1, 5])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.vivado
 def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode):
+    np.random.seed(0)
     if fold == -1:
         pe = 1
     else: