fix import timeouts on increasing datasets by precomputing batch size #5231

Open · wants to merge 1 commit into base: develop
8 changes: 0 additions & 8 deletions fiftyone/core/dataset.py
@@ -3392,10 +3392,6 @@ def _add_samples_batch(
if sample.media_type == fom.VIDEO:
sample.frames.save()

if batcher is not None and batcher.manual_backpressure:
# @todo can we infer content size from insert_many() above?
batcher.apply_backpressure(dicts)

return [str(d["_id"]) for d in dicts]

def _upsert_samples(
@@ -3459,10 +3455,6 @@ def _upsert_samples_batch(
if sample.media_type == fom.VIDEO:
sample.frames.save()

if batcher is not None and batcher.manual_backpressure:
# @todo can we infer content size from bulk_write() above?
batcher.apply_backpressure(dicts)

def _make_dict(
self,
sample,
8 changes: 0 additions & 8 deletions fiftyone/core/odm/database.py
@@ -809,9 +806,6 @@ def insert_documents(docs, coll, ordered=False, progress=None, num_docs=None):
batch = list(batch)
coll.insert_many(batch, ordered=ordered)
ids.extend(b["_id"] for b in batch)
if batcher.manual_backpressure:
# @todo can we infer content size from insert_many() above?
batcher.apply_backpressure(batch)

except BulkWriteError as bwe:
msg = bwe.details["writeErrors"][0]["errmsg"]
@@ -838,11 +835,6 @@ def bulk_write(ops, coll, ordered=False, progress=False):
for batch in batcher:
batch = list(batch)
coll.bulk_write(batch, ordered=ordered)
if batcher.manual_backpressure:
# @todo can we infer content size from bulk_write() above?
# @todo do we need a more accurate measure of size here?
content_size = sum(len(str(b)) for b in batch)
batcher.apply_backpressure(content_size)

except BulkWriteError as bwe:
msg = bwe.details["writeErrors"][0]["errmsg"]
117 changes: 113 additions & 4 deletions fiftyone/core/utils.py
@@ -1552,6 +1552,117 @@ def _compute_batch_size(self):
return self.batch_size


class ContentSizeBatcher(Batcher):
"""Class for iterating over the elements of an iterable with a dynamic
batch size to achieve a desired content size.

The batch sizes emitted when iterating over this object are dynamically
scaled such that the total content size of the batch is as close as
possible to a specified target size.

This batcher does not require backpressure feedback because it calculates
the total size of the iterable object before batching.

This class is often used in conjunction with a :class:`ProgressBar` to keep
the user apprised of the status of a long-running task.

Example usage::

import fiftyone.core.utils as fou

elements = range(int(1e7))

batcher = fou.ContentSizeBatcher(
elements,
target_size=2**20,
progress=True
)

with batcher:
for batch in batcher:
print("batch size: %d" % len(batch))

Args:
iterable: an iterable to batch over. Note that this batcher consumes the
iterable upfront in order to compute batch sizes, so ``None`` is not
supported
target_size (1048576): the target batch BSON content size, in bytes
min_batch_size (1): the minimum allowed batch size
max_batch_size (None): an optional maximum allowed batch size
return_views (False): whether to return each batch as a
:class:`fiftyone.core.view.DatasetView`. Only applicable when the
iterable is a :class:`fiftyone.core.collections.SampleCollection`
progress (False): whether to render a progress bar tracking the
consumption of the batches (True/False), use the default value
``fiftyone.config.show_progress_bars`` (None), or a progress
callback function to invoke instead
total (None): the length of ``iterable``. Only applicable when
``progress=True``. If not provided, it is computed via
``len(iterable)``, if possible
"""

def __init__(
self,
iterable,
target_size=2**20,
min_batch_size=1,
max_batch_size=None,
return_views=False,
progress=False,
total=None,
):
iterable, iterable_copy = itertools.tee(iterable)
super().__init__(
iterable, return_views=return_views, progress=progress, total=total
)
self.batch_sizes = self._compute_batch_sizes(
target_size, min_batch_size, max_batch_size, iterable_copy
)
self.curr_batch = 0

def _compute_batch_sizes(
Contributor:
This is not ideal because it requires iterating over the entire iterator before any writes start happening.

A large-scale add_samples() call could take 1 hour or more to complete. With this implementation, it will take an hour just to compute safe batch sizes, during which time there will be no progress bar or indication to the user of what's happening, and only then will it start adding samples to the dataset for the next hour.

We used dynamic batching with backpressure to try to find a compromise where writes will start immediately while also tuning the batch size to maximize throughput.

Perhaps there's a way to keep the greedy batching but add a safeguard to split unexpectedly large batches into multiple writes?
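For illustration, one form such a safeguard could take is a helper that splits any greedy batch whose serialized size exceeds a hard cap before writing; this is a hypothetical sketch, not code from this PR:

    from bson import json_util

    def split_oversized_batch(batch, max_bytes=2**20):
        """Yields sub-batches of ``batch`` whose serialized size stays under
        ``max_bytes``; a single document larger than the cap is yielded alone.
        """
        sub_batch, sub_size = [], 0
        for doc in batch:
            doc_size = len(json_util.dumps(doc))
            if sub_batch and sub_size + doc_size > max_bytes:
                yield sub_batch
                sub_batch, sub_size = [], 0

            sub_batch.append(doc)
            sub_size += doc_size

        if sub_batch:
            yield sub_batch

    # e.g. in insert_documents(), write each sub-batch separately:
    # for sub_batch in split_oversized_batch(batch):
    #     coll.insert_many(sub_batch, ordered=ordered)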

Contributor Author:
I think an alternative is that we can override the next() implementation for just this batcher and build the batch as we go, so instead of doing it upfront we can just iterate and build a list up to the max size and then return that.

We probably want to refactor a bit as well, because currently we have two content size batchers: one for backpressure, where we don't know the size of the items before batching (I think it's just a bunch of ops and not actually looking at the items), and another for when we have the items and can calculate their sizes.

Contributor:
"currently we have two content size batchers"

What are you referring to? We currently have three batchers:
https://github.com/voxel51/fiftyone-teams/blob/318d3abb38b478f557180bb6aed4f25681cc7c64/fiftyone/core/utils.py#L1579-L1606

  • LatencyDynamicBatcher: targets a specific latency between calls to generate the next batch
  • ContentSizeDynamicBatcher: targets a specific content size in each batch
  • StaticBatcher: simple fixed batch size

Latency and content size are correlated heuristics, and both would encounter problems if there are wildly different Sample sizes in a collection.

Contributor (@brimoor, Dec 14, 2024):
I guess you're referring to latency and content.

The latency batcher is the default for OSS because it is effectively parameterized by the frame rate on the progress bars you want to see (and the assumption that the database is a big boi and will take as much content in 0.2 seconds as you can throw at it, for example).

The content batcher is the default for Teams when there's an API connection involved, because the assumption is that the API gateway has a hard limit on request size, so we want to be certain not to exceed, say, 1 MB.

https://github.com/voxel51/fiftyone-teams/blob/318d3abb38b478f557180bb6aed4f25681cc7c64/fiftyone/core/config.py#L171
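For reference, the config fields under discussion can be sketched as follows (the "size" value and batcher_target_size_bytes appear in this diff; the "latency" value for the OSS default is assumed):

    import fiftyone as fo

    # OSS default: dynamic batching that targets a fixed latency per batch
    fo.config.default_batcher = "latency"  # assumed value

    # Teams/API default: batching that targets a fixed content size per batch,
    # e.g. to stay under an API gateway's request-size limit
    fo.config.default_batcher = "size"
    fo.config.batcher_target_size_bytes = 2**20  # ~1 MB per batch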

Contributor Author (@CamronStaley, Dec 14, 2024):
The batcher in this PR, "ContentSizeBatcher", is separate from ContentSizeDynamicBatcher because it doesn't use backpressure and has a different use case where we can view content size upfront, which (keep me honest @swheaton) isn't always the case. That's what I meant by specifically two "content size batchers"; I know there are other batchers as well.

I am proposing a couple of things:

  1. rename both of the content size batchers so that it's clear one is used for computing content size before writing the batch and the other is for estimating it based on the previous batch's results
  2. make it so this new batcher we have added computes batch size every time next() is called so that we don't have to do it all upfront (this would require overriding the inherited next method in its current state, so maybe a refactor would be good)

Or, if I am wrong about there being two use cases (computing content size before running the batch vs. computing it after running the batch), consolidate these into one method that just does number 2 above instead of precomputing the entire iterable.
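For illustration, a minimal sketch of option 2 (computing each batch lazily in next() rather than precomputing all batch sizes) might look like the following; it is hypothetical, not part of this PR, and it ignores return_views and progress handling but mirrors the PR's size estimate:

    from bson import json_util

    class LazyContentSizeBatcher:
        """Builds each batch on demand, closing it once the serialized content
        size reaches ``target_size`` or the batch reaches ``max_batch_size``.
        """

        def __init__(self, iterable, target_size=2**20, max_batch_size=None):
            self._iter = iter(iterable)
            self.target_size = target_size
            self.max_batch_size = max_batch_size

        def __iter__(self):
            return self

        def __next__(self):
            batch, content_size = [], 0
            for obj in self._iter:
                batch.append(obj)
                try:
                    content_size += len(json_util.dumps(obj))
                except Exception:
                    content_size += len(str(obj))

                if content_size >= self.target_size or (
                    self.max_batch_size is not None
                    and len(batch) >= self.max_batch_size
                ):
                    return batch

            if batch:
                return batch

            raise StopIteration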

self, target_size, min_batch_size, max_batch_size, iterable
):
batch_sizes = []
curr_batch_size = 0
curr_batch_content_size = 0

for obj in iterable:
try:
curr_batch_content_size += len(
json_util.dumps(self._make_dict(obj))
Contributor:
Couldn't this extra serialization potentially be very expensive?

Contributor Author:
We currently do this already in _upsert_samples_batch, so we are basically doubling how much we are doing it in that one method. In most other cases (such as bulk_write and insert_documents) it's the same total computation, since we are just moving it from post-computing for each batch to precomputing for the entire collection.

Contributor Author:
Though I think it might be possible to refactor _upsert_samples_batch so that we just pass the dicts as the iterable to begin with, instead of calculating them twice.
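On the serialization-cost question above, a rough way to gauge the overhead of the extra json_util.dumps() pass (hypothetical micro-benchmark; the document shapes and counts are illustrative):

    import timeit

    from bson import json_util

    # 10k small sample-like dicts; real samples with labels will be larger
    docs = [
        {"filepath": f"{i}.jpg", "tags": [], "metadata": None}
        for i in range(10_000)
    ]

    secs = timeit.timeit(
        lambda: sum(len(json_util.dumps(d)) for d in docs), number=1
    )
    print(f"sized {len(docs)} docs in {secs:.3f}s")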

)
except Exception:
curr_batch_content_size += len(str(obj))
curr_batch_size += 1
if curr_batch_size >= min_batch_size and (
curr_batch_content_size >= target_size
or curr_batch_size == max_batch_size
):
batch_sizes.append(curr_batch_size)
curr_batch_size = 0
curr_batch_content_size = 0

if curr_batch_size:
batch_sizes.append(curr_batch_size)

return batch_sizes

def _make_dict(self, obj):
return (
obj.to_mongo_dict(include_id=True)
if hasattr(obj, "to_mongo_dict")
else obj
)

def _compute_batch_size(self):
size = 1
if self.curr_batch < len(self.batch_sizes):
size = self.batch_sizes[self.curr_batch]
self.curr_batch += 1
return size

brimoor marked this conversation as resolved.

def get_default_batcher(iterable, progress=False, total=None):
"""Returns a :class:`Batcher` over ``iterable`` using defaults from your
FiftyOne config.
@@ -1588,11 +1699,9 @@ def get_default_batcher(iterable, progress=False, total=None):
)
elif default_batcher == "size":
target_content_size = fo.config.batcher_target_size_bytes
return ContentSizeDynamicBatcher(
iterable,
return ContentSizeBatcher(
iterable=iterable,
target_size=target_content_size,
init_batch_size=1,
max_batch_beta=8.0,
max_batch_size=100000,
progress=progress,
total=total,
49 changes: 45 additions & 4 deletions tests/unittests/utils_tests.py
@@ -12,6 +12,7 @@
from unittest.mock import MagicMock, patch

from bson import ObjectId
from bson import json_util
import numpy as np

import fiftyone as fo
@@ -59,10 +60,7 @@ def test_get_default_batcher(self):
target_size,
):
batcher = fou.get_default_batcher(iterable)
self.assertTrue(
isinstance(batcher, fou.ContentSizeDynamicBatcher)
)
self.assertEqual(batcher.target_measurement, target_size)
self.assertTrue(isinstance(batcher, fou.ContentSizeBatcher))

with patch.object(fo.config, "default_batcher", "invalid"):
self.assertRaises(ValueError, fou.get_default_batcher, iterable)
@@ -84,6 +82,49 @@ def test_static_batcher_covered(self):
batches = [batch for batch in batcher]
self.assertListEqual(batches, [iterable])

def test_content_size_batcher(self):
n = 10
samples = [fo.Sample(filepath=f"{i}.jpg") for i in range(n)]

# Test min batch size same as total size
batcher = fou.ContentSizeBatcher(iter(samples), min_batch_size=n)
expected = [n]
self.assertListEqual(expected, batcher.batch_sizes)
self.assertEqual(n, sum(batcher.batch_sizes))

# Test max batch size same as min_size and less than target
batcher = fou.ContentSizeBatcher(iter(samples), max_batch_size=1)
expected = [1] * n
self.assertListEqual(expected, batcher.batch_sizes)
self.assertEqual(n, sum(batcher.batch_sizes))

# Test default case
batcher = fou.ContentSizeBatcher(iter(samples))
expected = [n]
self.assertListEqual(expected, batcher.batch_sizes)
self.assertEqual(n, sum(batcher.batch_sizes))

# Test target smaller than min
batcher = fou.ContentSizeBatcher(iter(samples), target_size=1)
expected = [1] * n
self.assertListEqual(expected, batcher.batch_sizes)

# Test target size half of total
total_size = len(
json_util.dumps(
[sample.to_mongo_dict(include_id=True) for sample in samples]
)
)
target_size = (
total_size // 2 - 100
) # offset because the items slightly differ in size
expected = [n // 2] * 2
batcher = fou.ContentSizeBatcher(
iter(samples), target_size=target_size
)
self.assertListEqual(expected, batcher.batch_sizes)
self.assertEqual(n, sum(batcher.batch_sizes))

def test_static_batcher_perfect_boundary(self):
iterable = list(range(200))
batcher = fou.StaticBatcher(iterable, batch_size=100, progress=False)