Skip to content

Commit 795a36e

Browse files
authored
[FLINK-39399][table-runtime] Fix integer overflow in HyperLogLogPlusPlus causing APPROX_COUNT_DISTINCT undercount
This closes #27895.
1 parent b1c858e commit 795a36e

2 files changed

Lines changed: 32 additions & 2 deletions

File tree

flink-table/flink-table-runtime/src/main/java/org/apache/flink/table/runtime/functions/aggregate/hyperloglog/HyperLogLogPlusPlus.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4946,8 +4946,8 @@ public long query(HllBuffer buffer) {
49464946
int i = 0;
49474947
int shift = 0;
49484948
while (idx < m && i < REGISTERS_PER_WORD) {
4949-
long mIdx = (word >>> shift) & REGISTER_WORD_MASK;
4950-
zInverse += 1.0 / (1 << mIdx);
4949+
int mIdx = (int) ((word >>> shift) & REGISTER_WORD_MASK);
4950+
zInverse += 1.0 / (1L << mIdx);
49514951
if (mIdx == 0) {
49524952
v += 1.0d;
49534953
}

flink-table/flink-table-runtime/src/test/java/org/apache/flink/table/runtime/functions/aggregate/hyperloglog/HyperLogLogPlusPlusTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,36 @@ private void testCardinalityEstimates(
159159
}
160160
}
161161

162+
@Test
163+
void testQueryWithRegisterValuesAbove32() {
164+
// Regression test: hand-build an HLL buffer in which every register holds 35 (>= 32),
// then check that query() returns an estimate consistent with a 64-bit shift.
165+
// Before the fix, query() evaluated "1 << mIdx" as an int shift. Java uses only the low
// five bits of an int shift distance, so the shift wraps for mIdx >= 32
166+
// (1 << 35 == 1 << 3 == 8), producing estimates that are far too small.
167+
// The fix uses "1L << mIdx" (long shift), so 1L << 35 == 34359738368.
168+
int registerValue = 35;
169+
HyperLogLogPlusPlus hll = new HyperLogLogPlusPlus(0.01);
170+
HllBuffer buffer = createHllBuffer(hll);
171+
172+
// Fill every word of the buffer with 10 registers of 6 bits each, all set to registerValue.
// NOTE(review): 10 and 6 are hard-coded here; presumably they mirror REGISTERS_PER_WORD
// and the register width used by query() -- confirm against HyperLogLogPlusPlus.
173+
for (int w = 0; w < hll.getNumWords(); w++) {
174+
long word = 0L;
175+
for (int r = 0; r < 10; r++) {
176+
// Register r occupies bits [6r, 6r + 6) of the word.
word |= ((long) registerValue) << (r * 6);
177+
}
178+
buffer.array[w] = word;
179+
}
180+
181+
long estimate = hll.query(buffer);
182+
183+
// With the correct long shift, the estimate should be astronomically large
184+
// (on the order of alpha * m * 2^35 ~ 4e14).
185+
// With the buggy int shift, the estimate would be only around 95K.
186+
// Assert that the estimate exceeds 1e12: the threshold sits far above the buggy value
// and far below the correct one, so it cleanly separates the two cases.
187+
assertThat(estimate)
188+
.as("Estimate should reflect long-shift math for register values >= 32")
189+
.isGreaterThan(1_000_000_000_000L);
190+
}
191+
162192
public HllBuffer createHllBuffer(HyperLogLogPlusPlus hll) {
163193
HllBuffer buffer = new HllBuffer();
164194
buffer.array = new long[hll.getNumWords()];

0 commit comments

Comments
 (0)