From 4ea27dcf27d902bc23aba3408d983580644d1fd1 Mon Sep 17 00:00:00 2001 From: Jordan Epstein Date: Tue, 9 Jun 2026 16:12:12 -0500 Subject: [PATCH] GH-1179: Fix VectorAppender data size computation for variable-width vectors with non-zero start offsets VectorAppender computed the delta vector's data size as its last offset value, which is only correct when the offset buffer starts at zero. Vectors imported through the C data interface from sliced arrays can have a non-zero first offset; appending them copied the unreferenced data buffer prefix into the target, inflating it on every append until allocation eventually failed with OversizedAllocationException. Compute the data size as the distance between the first and last offsets, copy from the first offset, and rebase appended offsets accordingly. Fixes #1179. --- .../arrow/vector/util/VectorAppender.java | 31 ++++--- .../arrow/vector/util/TestVectorAppender.java | 80 +++++++++++++++++++ 2 files changed, 100 insertions(+), 11 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index e7c0d11cb9..d522ffe6f5 100644 --- a/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -125,10 +125,15 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { targetVector .getOffsetBuffer() .getInt((long) targetVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + // The delta vector's offset buffer need not start at zero (e.g. a vector imported through + // the C data interface from a sliced array), so the amount of data to append is the + // distance between its first and last offsets, not the last offset itself. + int deltaDataStart = deltaVector.getOffsetBuffer().getInt(0); int deltaDataSize = deltaVector - .getOffsetBuffer() - .getInt((long) deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + .getOffsetBuffer() + .getInt((long) deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH) + - deltaDataStart; int newValueCapacity = targetDataSize + deltaDataSize; // make sure there is enough capacity @@ -149,7 +154,7 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { // append data buffer MemoryUtil.copyMemory( - deltaVector.getDataBuffer().memoryAddress(), + deltaVector.getDataBuffer().memoryAddress() + deltaDataStart, targetVector.getDataBuffer().memoryAddress() + targetDataSize, deltaDataSize); @@ -160,7 +165,7 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { + (targetVector.getValueCount() + 1) * BaseVariableWidthVector.OFFSET_WIDTH, deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); - // increase each offset from the second buffer + // rebase each appended offset to the target's data, accounting for the delta's start offset for (int i = 0; i < deltaVector.getValueCount(); i++) { int oldOffset = targetVector @@ -172,7 +177,7 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { .getOffsetBuffer() .setInt( (long) (targetVector.getValueCount() + 1 + i) * BaseVariableWidthVector.OFFSET_WIDTH, - oldOffset + targetDataSize); + oldOffset - deltaDataStart + targetDataSize); } ((BaseVariableWidthVector) targetVector).setLastSet(newValueCount - 1); targetVector.setValueCount(newValueCount); @@ -196,11 +201,15 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { .getOffsetBuffer() .getLong( (long) targetVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH); + // see the corresponding comment in visit(BaseVariableWidthVector, Void): the delta's + // offset buffer need not start at zero + long deltaDataStart = deltaVector.getOffsetBuffer().getLong(0); long deltaDataSize = deltaVector - .getOffsetBuffer() - .getLong( - (long) deltaVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH); + .getOffsetBuffer() + .getLong( + (long) deltaVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH) + - deltaDataStart; long newValueCapacity = targetDataSize + deltaDataSize; // make sure there is enough capacity @@ -221,7 +230,7 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { // append data buffer MemoryUtil.copyMemory( - deltaVector.getDataBuffer().memoryAddress(), + deltaVector.getDataBuffer().memoryAddress() + deltaDataStart, targetVector.getDataBuffer().memoryAddress() + targetDataSize, deltaDataSize); @@ -232,7 +241,7 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { + (targetVector.getValueCount() + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH, deltaVector.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH); - // increase each offset from the second buffer + // rebase each appended offset to the target's data, accounting for the delta's start offset for (int i = 0; i < deltaVector.getValueCount(); i++) { long oldOffset = targetVector @@ -245,7 +254,7 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { .setLong( (long) (targetVector.getValueCount() + 1 + i) * BaseLargeVariableWidthVector.OFFSET_WIDTH, - oldOffset + targetDataSize); + oldOffset - deltaDataStart + targetDataSize); } ((BaseLargeVariableWidthVector) targetVector).setLastSet(newValueCount - 1); targetVector.setValueCount(newValueCount); diff --git a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java index df5521a1ad..dbbe5d3be5 100644 --- a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java +++ b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -26,10 +26,13 @@ import java.util.List; import java.util.stream.IntStream; import java.util.stream.Stream; +import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; @@ -53,6 +56,7 @@ import org.apache.arrow.vector.holders.NullableBigIntHolder; import org.apache.arrow.vector.holders.NullableFloat4Holder; import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.testing.ValueVectorDataPopulator; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -178,6 +182,82 @@ public void testAppendVariableWidthVector() { } } + @Test + public void testAppendVariableWidthVectorWithNonZeroStartOffset() { + try (VarCharVector target = new VarCharVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + target.allocateNew(64, 4); + ValueVectorDataPopulator.setVector(target, "a0", "a1"); + + // Build a delta vector whose offset buffer does not start at zero, as produced e.g. by + // importing a sliced array through the C data interface. The values are "BBBB" and + // "CCCC"; the data buffer additionally holds 4 bytes of unreferenced prefix ("AAAA"). + try (ArrowBuf validity = allocator.buffer(1); + ArrowBuf offsets = allocator.buffer(12); + ArrowBuf data = allocator.buffer(12)) { + validity.setByte(0, 0b11); + offsets.setInt(0, 4); + offsets.setInt(4, 8); + offsets.setInt(8, 12); + data.setBytes(0, "AAAABBBBCCCC".getBytes(StandardCharsets.UTF_8)); + delta.loadFieldBuffers(new ArrowFieldNode(2, 0), Arrays.asList(validity, offsets, data)); + } + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + // the unreferenced prefix must not be appended + assertEquals( + 4 + 8, + target + .getOffsetBuffer() + .getInt((long) target.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH)); + + try (VarCharVector expected = new VarCharVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, "a0", "a1", "BBBB", "CCCC"); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendLargeVariableWidthVectorWithNonZeroStartOffset() { + try (LargeVarCharVector target = new LargeVarCharVector("", allocator); + LargeVarCharVector delta = new LargeVarCharVector("", allocator)) { + + target.allocateNew(64, 4); + ValueVectorDataPopulator.setVector(target, "a0", "a1"); + + try (ArrowBuf validity = allocator.buffer(1); + ArrowBuf offsets = allocator.buffer(24); + ArrowBuf data = allocator.buffer(12)) { + validity.setByte(0, 0b11); + offsets.setLong(0, 4); + offsets.setLong(8, 8); + offsets.setLong(16, 12); + data.setBytes(0, "AAAABBBBCCCC".getBytes(StandardCharsets.UTF_8)); + delta.loadFieldBuffers(new ArrowFieldNode(2, 0), Arrays.asList(validity, offsets, data)); + } + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals( + 4 + 8, + target + .getOffsetBuffer() + .getLong((long) target.getValueCount() * BaseLargeVariableWidthVector.OFFSET_WIDTH)); + + try (LargeVarCharVector expected = new LargeVarCharVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, "a0", "a1", "BBBB", "CCCC"); + assertVectorsEqual(expected, target); + } + } + } + @Test public void testAppendVariableWidthViewVector() { final int length1 = 10;