From d0fab0cf8aba8f2cf3cb34be50040d6b3b4d1448 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20D=C5=BEado=C5=88?= <524839@mail.muni.cz> Date: Tue, 30 Jun 2026 14:27:19 +0000 Subject: [PATCH 1/3] fix: access logical dataset --- rationai/mlkit/data/datasets/slides_tiles_loader.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/rationai/mlkit/data/datasets/slides_tiles_loader.py b/rationai/mlkit/data/datasets/slides_tiles_loader.py index 7359965..afe6e2b 100644 --- a/rationai/mlkit/data/datasets/slides_tiles_loader.py +++ b/rationai/mlkit/data/datasets/slides_tiles_loader.py @@ -76,9 +76,12 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: if len(tiles) == 0: return {} - # 1. Grab the column directly from the underlying PyArrow Table - slide_ids = tiles.data.column("slide_id") - num_rows = len(slide_ids) + # 1. Read slide_id through HFDataset's logical interface so that any + # prior .filter() / .select() on the dataset is respected. + # Accessing tiles.data.column() directly bypasses the _indices + # mapping and returns physical-table rows, causing stale offsets. + slide_ids = pa.array(tiles["slide_id"]) + num_rows = len(tiles) # 2. Handle the "Large" type conversion current_type = slide_ids.type @@ -87,8 +90,7 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: elif pa.types.is_binary(current_type): slide_ids = slide_ids.cast(pa.large_binary()) - # 3. Generate sequential row indices - # np.arange is used here because PyArrow can wrap it instantly with zero-copy overhead + # 3. Generate sequential logical row indices row_indices = pa.array(np.arange(num_rows, dtype=np.int64)) # 4. Combine them into a lightweight PyArrow Table From 83a86b8609861b5a08125f9443075dd4612722ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20D=C5=BEado=C5=88?= <524839@mail.muni.cz> Date: Tue, 30 Jun 2026 15:20:06 +0000 Subject: [PATCH 2/3] fix: comments --- rationai/mlkit/data/datasets/slides_tiles_loader.py | 5 ++--- uv.lock | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/rationai/mlkit/data/datasets/slides_tiles_loader.py b/rationai/mlkit/data/datasets/slides_tiles_loader.py index afe6e2b..996a8c6 100644 --- a/rationai/mlkit/data/datasets/slides_tiles_loader.py +++ b/rationai/mlkit/data/datasets/slides_tiles_loader.py @@ -78,8 +78,6 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: # 1. Read slide_id through HFDataset's logical interface so that any # prior .filter() / .select() on the dataset is respected. - # Accessing tiles.data.column() directly bypasses the _indices - # mapping and returns physical-table rows, causing stale offsets. slide_ids = pa.array(tiles["slide_id"]) num_rows = len(tiles) @@ -90,7 +88,8 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: elif pa.types.is_binary(current_type): slide_ids = slide_ids.cast(pa.large_binary()) - # 3. Generate sequential logical row indices + # 3. Generate sequential row indices + # np.arange is used here because PyArrow can wrap it instantly with zero-copy overhead row_indices = pa.array(np.arange(num_rows, dtype=np.int64)) # 4. Combine them into a lightweight PyArrow Table diff --git a/uv.lock b/uv.lock index 6c5d097..cb69bbf 100644 --- a/uv.lock +++ b/uv.lock @@ -2287,7 +2287,7 @@ dependencies = [ [[package]] name = "rationai-mlkit" -version = "0.4.0" +version = "0.4.1" source = { virtual = "." } dependencies = [ { name = "datasets" }, From ea66d2cbe0548d6a355ff812d238cba6764f85fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20D=C5=BEado=C5=88?= <524839@mail.muni.cz> Date: Tue, 30 Jun 2026 15:24:40 +0000 Subject: [PATCH 3/3] feat: pre-format --- rationai/mlkit/data/datasets/slides_tiles_loader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rationai/mlkit/data/datasets/slides_tiles_loader.py b/rationai/mlkit/data/datasets/slides_tiles_loader.py index 996a8c6..d487ebe 100644 --- a/rationai/mlkit/data/datasets/slides_tiles_loader.py +++ b/rationai/mlkit/data/datasets/slides_tiles_loader.py @@ -76,9 +76,8 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: if len(tiles) == 0: return {} - # 1. Read slide_id through HFDataset's logical interface so that any - # prior .filter() / .select() on the dataset is respected. - slide_ids = pa.array(tiles["slide_id"]) + # 1. Read slide_id as Arrow data, respecting any prior .filter() / .select(). + slide_ids = tiles.with_format("arrow")["slide_id"] num_rows = len(tiles) # 2. Handle the "Large" type conversion