sei-protocol · cody-littley · Jun 9, 2026 · Jun 9, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/sei-db/db_engine/litt/disktable/segment/segment_test.go b/sei-db/db_engine/litt/disktable/segment/segment_test.go
@@ -816,6 +816,70 @@ func reloadSegmentExpectingRecovery(t *testing.T, segmentPath *SegmentPath, inde
 	return keys, seg
 }
 
+// TestSealLoadedSegmentSingleShardPrefix locks the single-shard durability invariant: with one shard
+// all values append to a single value file and all keys to a single key file in write order, so after
+// a crash the surviving Put groups form a contiguous PREFIX of the write order — never a gapped
+// subset. Each sub-case seals a segment, truncates one file to simulate a torn tail, reloads, and
+// asserts the survivors are exactly key000..key{j-1} for the expected survivor count j.
+func TestSealLoadedSegmentSingleShardPrefix(t *testing.T) {
+	t.Parallel()
+
+	const (
+		n        = 8
+		valueLen = 10 // valueFor emits "val" plus 7 zero-padded digits: exactly 10 bytes
+	)
+	keyFor := func(i int) []byte { return []byte(fmt.Sprintf("key%03d", i)) }
+	valueFor := func(i int) []byte { return []byte(fmt.Sprintf("val%07d", i)) }
+
+	// writeRun writes n standalone Puts to a fresh single-shard segment, seals it, and flips the
+	// metadata back to unsealed to simulate a crash before the seal completed.
+	writeRun := func(t *testing.T) (*SegmentPath, uint32) {
+		seg, segmentPath, index := newSingleShardSegment(t)
+		for i := 0; i < n; i++ {
+			writeNoErr(t, seg, &types.PutRequest{Key: keyFor(i), Value: valueFor(i)})
+		}
+		_, err := seg.Seal(time.Now())
+		require.NoError(t, err)
+		markSegmentUnsealed(t, segmentPath, index)
+		return segmentPath, index
+	}
+
+	// assertPrefix asserts the recovered keys are exactly key000..key{survivors-1} in write order with
+	// no gaps, and that every survivor's value range fits within the (possibly truncated) value file.
+	assertPrefix := func(t *testing.T, keys []*types.ScopedKey, survivors int, valueFileSize int) {
+		require.Len(t, keys, survivors)
+		for i := 0; i < survivors; i++ {
+			require.Equal(t, string(keyFor(i)), string(keys[i].Key), "record %d", i)
+			end := int(keys[i].Address.Offset()) + int(keys[i].Address.ValueSize())
+			require.LessOrEqual(t, end, valueFileSize, "survivor %d value must fit in the value file", i)
+		}
+	}
+
+	t.Run("value_file_torn_mid_value", func(t *testing.T) {
+		t.Parallel()
+		segmentPath, index := writeRun(t)
+		// Values occupy [i*10,(i+1)*10); total 80 bytes. Truncate to 55, landing inside value 5
+		// ([50,60)). Survivors are the values whose end <= 55, i.e. key000..key004.
+		const truncatedSize = 55
+		truncateValueFileBy(t, segmentPath, index, 0, n*valueLen-truncatedSize)
+		keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index)
+		assertPrefix(t, keys, 5, truncatedSize)
+	})
+
+	t.Run("key_file_torn_mid_record", func(t *testing.T) {
+		t.Parallel()
+		segmentPath, index := writeRun(t)
+		// Keys are fixed length, so every key record is the same size r. Cutting 3*r-1 bytes from the
+		// tail removes the records for key006 and key007 entirely and leaves only the first byte of
+		// key005's record, so that record is torn and discarded. The value file is intact, so the
+		// survivors are bounded by the key-file prefix: key000..key004.
+		r := int(keyRecordSize(keyFor(0)))
+		truncateKeyFileBy(t, segmentPath, index, 3*r-1)
+		keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index)
+		assertPrefix(t, keys, 5, n*valueLen)
+	})
+}
+
 // TestSealLoadedSegmentGroupAtomicity covers all of the torn-write scenarios that
 // sealLoadedSegment must handle. Each subtest builds a sealed segment, manually corrupts it on
 // disk to simulate a crash mid-write, flips the metadata's sealed bit back to false, then reloads

diff --git a/sei-db/db_engine/litt/table.go b/sei-db/db_engine/litt/table.go
@@ -33,6 +33,11 @@ type Table interface {
 	// of the value is 2^32 bytes. This database has been optimized under the assumption that values
 	// are generally much larger than keys. This affects performance, but not correctness.
 	//
+	// Although writes are individually atomic, the DB makes no guarantees about the atomicity or ordering of
+	// multiple writes in aggregate. That is to say, if a caller writes A and then B and the DB crashes before
+	// flushing, it may be the case that B is persisted but A is not. The exception is a table whose sharding
+	// factor is 1, in which case writes are guaranteed to become crash durable in the order they were issued.
+	//
 	// It is not safe to modify the byte slices passed to this function after the call
 	// (the key bytes, the value bytes, and every secondary key's bytes).
 	Put(key []byte, value []byte, secondaryKeys ...*types.SecondaryKey) error
@@ -46,6 +51,12 @@ type Table interface {
 	// of a value is 2^32 bytes. This database has been optimized under the assumption that values
 	// are generally much larger than keys. This affects performance, but not correctness.
 	//
+	// Although writes in a batch are individually atomic, the DB makes no guarantees about the atomicity or
+	// ordering of multiple writes in aggregate. That is to say, if a caller writes A and then B in a batch and
+	// the DB crashes before flushing, it may be the case that B is persisted but A is not. The exception is a
+	// table whose sharding factor is 1, in which case writes are guaranteed to become crash durable in the
+	// order they were issued.
+	//
 	// It is not safe to modify the byte slices passed to this function after the call
 	// (including the key byte slices, the value byte slices, and every secondary key's bytes).
 	PutBatch(batch []*types.PutRequest) error

diff --git a/sei-db/db_engine/litt/table_config.go b/sei-db/db_engine/litt/table_config.go
@@ -27,6 +27,10 @@ type TableConfig struct {
 	// The default is 8. Must be in the range [1, MaxShardingFactor]. Storing this as a uint8 makes it structurally
 	// impossible to configure more shards than the on-disk format can address. May be changed at runtime via
 	// Table.SetShardingFactor().
+	//
+	// Normally, writes to a table are individually atomic but neither atomic nor ordered in aggregate: if a caller
+	// writes A and then B and the DB crashes before flushing, it may be the case that B is persisted but A is not.
+	// However, if the sharding factor is 1, then all writes are made crash durable in the order they were issued.
 	ShardingFactor uint8
 
 	// The size of the write cache, in bytes, for the table. A write cache stores recently written values for fast

diff --git a/sei-db/ledger_db/block/block_db.go b/sei-db/ledger_db/block/block_db.go