From d0fab0cf8aba8f2cf3cb34be50040d6b3b4d1448 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20D=C5=BEado=C5=88?= <524839@mail.muni.cz>
Date: Tue, 30 Jun 2026 14:27:19 +0000
Subject: [PATCH 1/3] fix: access logical dataset

---
 rationai/mlkit/data/datasets/slides_tiles_loader.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/rationai/mlkit/data/datasets/slides_tiles_loader.py b/rationai/mlkit/data/datasets/slides_tiles_loader.py
index 7359965..afe6e2b 100644
--- a/rationai/mlkit/data/datasets/slides_tiles_loader.py
+++ b/rationai/mlkit/data/datasets/slides_tiles_loader.py
@@ -76,9 +76,12 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]:
         if len(tiles) == 0:
             return {}
 
-        # 1. Grab the column directly from the underlying PyArrow Table
-        slide_ids = tiles.data.column("slide_id")
-        num_rows = len(slide_ids)
+        # 1. Read slide_id through HFDataset's logical interface so that any
+        #    prior .filter() / .select() on the dataset is respected.
+        #    Accessing tiles.data.column() directly bypasses the _indices
+        #    mapping and returns physical-table rows, causing stale offsets.
+        slide_ids = pa.array(tiles["slide_id"])
+        num_rows = len(tiles)
 
         # 2. Handle the "Large" type conversion
         current_type = slide_ids.type
@@ -87,8 +90,7 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]:
         elif pa.types.is_binary(current_type):
             slide_ids = slide_ids.cast(pa.large_binary())
 
-        # 3. Generate sequential row indices
-        # np.arange is used here because PyArrow can wrap it instantly with zero-copy overhead
+        # 3. Generate sequential logical row indices
         row_indices = pa.array(np.arange(num_rows, dtype=np.int64))
 
         # 4. Combine them into a lightweight PyArrow Table

From 83a86b8609861b5a08125f9443075dd4612722ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20D=C5=BEado=C5=88?= <524839@mail.muni.cz>
Date: Tue, 30 Jun 2026 15:20:06 +0000
Subject: [PATCH 2/3] fix: trim comments and sync uv.lock to 0.4.1

---
 rationai/mlkit/data/datasets/slides_tiles_loader.py | 5 ++---
 uv.lock                                             | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/rationai/mlkit/data/datasets/slides_tiles_loader.py b/rationai/mlkit/data/datasets/slides_tiles_loader.py
index afe6e2b..996a8c6 100644
--- a/rationai/mlkit/data/datasets/slides_tiles_loader.py
+++ b/rationai/mlkit/data/datasets/slides_tiles_loader.py
@@ -78,8 +78,6 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]:
 
         # 1. Read slide_id through HFDataset's logical interface so that any
         #    prior .filter() / .select() on the dataset is respected.
-        #    Accessing tiles.data.column() directly bypasses the _indices
-        #    mapping and returns physical-table rows, causing stale offsets.
         slide_ids = pa.array(tiles["slide_id"])
         num_rows = len(tiles)
 
@@ -90,7 +88,8 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]:
         elif pa.types.is_binary(current_type):
             slide_ids = slide_ids.cast(pa.large_binary())
 
-        # 3. Generate sequential logical row indices
+        # 3. Generate sequential row indices
+        # np.arange is used here because PyArrow can wrap its buffer without copying
         row_indices = pa.array(np.arange(num_rows, dtype=np.int64))
 
         # 4. Combine them into a lightweight PyArrow Table
diff --git a/uv.lock b/uv.lock
index 6c5d097..cb69bbf 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2287,7 +2287,7 @@ dependencies = [
 
 [[package]]
 name = "rationai-mlkit"
-version = "0.4.0"
+version = "0.4.1"
 source = { virtual = "." }
 dependencies = [
     { name = "datasets" },

From ea66d2cbe0548d6a355ff812d238cba6764f85fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20D=C5=BEado=C5=88?= <524839@mail.muni.cz>
Date: Tue, 30 Jun 2026 15:24:40 +0000
Subject: [PATCH 3/3] feat: read slide_id via with_format("arrow")

---
 rationai/mlkit/data/datasets/slides_tiles_loader.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/rationai/mlkit/data/datasets/slides_tiles_loader.py b/rationai/mlkit/data/datasets/slides_tiles_loader.py
index 996a8c6..d487ebe 100644
--- a/rationai/mlkit/data/datasets/slides_tiles_loader.py
+++ b/rationai/mlkit/data/datasets/slides_tiles_loader.py
@@ -76,9 +76,8 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]:
         if len(tiles) == 0:
             return {}
 
-        # 1. Read slide_id through HFDataset's logical interface so that any
-        #    prior .filter() / .select() on the dataset is respected.
-        slide_ids = pa.array(tiles["slide_id"])
+        # 1. Read slide_id as Arrow data, respecting any prior .filter() / .select().
+        slide_ids = tiles.with_format("arrow")["slide_id"]
         num_rows = len(tiles)
 
         # 2. Handle the "Large" type conversion