Skip to content

Commit cc98253

Browse files
authored
Implement Devnet-4 metrics from leanMetrics PR #29 (#289)
## Motivation Implements the metrics defined in [leanEthereum/leanMetrics#29](leanEthereum/leanMetrics#29) for Devnet-4 monitoring: block production, gossip message sizes, sync status, and updated histogram buckets. ## Description ### Block Production Metrics (`blockchain/metrics.rs` → instrumented in `lib.rs` + `store.rs`) | Metric | Type | Buckets | |--------|------|---------| | `lean_block_building_time_seconds` | Histogram | 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1 | | `lean_block_building_payload_aggregation_time_seconds` | Histogram | 0.1, 0.25, 0.5, 0.75, 1, 2, 3, 4 | | `lean_block_aggregated_payloads` | Histogram | 1, 2, 4, 8, 16, 32, 64, 128 | | `lean_block_building_success_total` | Counter | — | | `lean_block_building_failures_total` | Counter | — | - `propose_block()` is wrapped with a timing guard for total block building time, and increments success/failure counters on each exit path. - `produce_block_with_signatures()` times the `build_block()` call specifically for payload aggregation, and observes the aggregated payload count. ### Sync Status (`blockchain/metrics.rs` → tracked in `lib.rs`) | Metric | Type | Labels | |--------|------|--------| | `lean_node_sync_status` | Gauge | status=idle,syncing,synced | - Set to `idle` at startup (before first tick). - Updated to `syncing` or `synced` after every block, by comparing `head_slot` against the wall clock slot. ### Gossip Message Size Metrics (`p2p/metrics.rs` → instrumented in `gossipsub/handler.rs`) | Metric | Type | Buckets | |--------|------|---------| | `lean_gossip_block_size_bytes` | Histogram | 10K, 50K, 100K, 250K, 500K, 1M, 2M, 5M | | `lean_gossip_attestation_size_bytes` | Histogram | 512, 1K, 2K, 4K, 8K, 16K | | `lean_gossip_aggregation_size_bytes` | Histogram | 1K, 4K, 16K, 64K, 128K, 256K, 512K, 1M | - Observed on the **uncompressed** message after snappy decompression, for each gossip message type. ### Modified Existing Metric - `lean_committee_signatures_aggregation_time_seconds`: buckets updated from `[0.005..1.0]` to `[0.05..4.0]` to capture longer aggregation times in Devnet-4. ## How to Test 1. `make fmt` — passes 2. `make lint` — passes (clippy clean) 3. `make test` — spec test failures are pre-existing (fixture format mismatch, unrelated to this PR) 4. Run a local devnet and verify new metrics appear on the `/metrics` endpoint (port 5054)
1 parent 0b2c281 commit cc98253

File tree

5 files changed

+214
-11
lines changed

5 files changed

+214
-11
lines changed

crates/blockchain/src/lib.rs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ impl BlockChain {
4747
is_aggregator: bool,
4848
) -> BlockChain {
4949
metrics::set_is_aggregator(is_aggregator);
50+
metrics::set_node_sync_status(metrics::SyncStatus::Idle);
5051
let genesis_time = store.config().genesis_time;
5152
let key_manager = key_manager::KeyManager::new(validator_keys);
5253
let handle = BlockChainServer {
@@ -217,11 +218,14 @@ impl BlockChainServer {
217218
fn propose_block(&mut self, slot: u64, validator_id: u64) {
218219
info!(%slot, %validator_id, "We are the proposer for this slot");
219220

221+
let _timing = metrics::time_block_building();
222+
220223
// Build the block with attestation signatures
221224
let Ok((block, attestation_signatures, _post_checkpoints)) =
222225
store::produce_block_with_signatures(&mut self.store, slot, validator_id)
223226
.inspect_err(|err| error!(%slot, %validator_id, %err, "Failed to build block"))
224227
else {
228+
metrics::inc_block_building_failures();
225229
return;
226230
};
227231

@@ -232,6 +236,7 @@ impl BlockChainServer {
232236
.sign_block_root(validator_id, slot as u32, &block_root)
233237
.inspect_err(|err| error!(%slot, %validator_id, %err, "Failed to sign block root"))
234238
else {
239+
metrics::inc_block_building_failures();
235240
return;
236241
};
237242

@@ -249,9 +254,12 @@ impl BlockChainServer {
249254
// Process the block locally before publishing
250255
if let Err(err) = self.process_block(signed_block.clone()) {
251256
error!(%slot, %validator_id, %err, "Failed to process built block");
257+
metrics::inc_block_building_failures();
252258
return;
253259
};
254260

261+
metrics::inc_block_building_success();
262+
255263
// Publish to gossip network
256264
if let Some(ref p2p) = self.p2p {
257265
let _ = p2p
@@ -264,10 +272,21 @@ impl BlockChainServer {
264272

265273
fn process_block(&mut self, signed_block: SignedBlock) -> Result<(), StoreError> {
266274
store::on_block(&mut self.store, signed_block)?;
267-
metrics::update_head_slot(self.store.head_slot());
275+
let head_slot = self.store.head_slot();
276+
metrics::update_head_slot(head_slot);
268277
metrics::update_latest_justified_slot(self.store.latest_justified().slot);
269278
metrics::update_latest_finalized_slot(self.store.latest_finalized().slot);
270279
metrics::update_validators_count(self.key_manager.validator_ids().len() as u64);
280+
281+
// Update sync status based on head slot vs wall clock slot
282+
let current_slot = self.store.time() / INTERVALS_PER_SLOT;
283+
let status = if head_slot >= current_slot {
284+
metrics::SyncStatus::Synced
285+
} else {
286+
metrics::SyncStatus::Syncing
287+
};
288+
metrics::set_node_sync_status(status);
289+
271290
for table in ALL_TABLES {
272291
metrics::update_table_bytes(table.name(), self.store.estimate_table_bytes(table));
273292
}

crates/blockchain/src/metrics.rs

Lines changed: 115 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ static LEAN_COMMITTEE_SIGNATURES_AGGREGATION_TIME_SECONDS: std::sync::LazyLock<H
261261
register_histogram!(
262262
"lean_committee_signatures_aggregation_time_seconds",
263263
"Time taken to aggregate committee signatures",
264-
vec![0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0]
264+
vec![0.05, 0.1, 0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0]
265265
)
266266
.unwrap()
267267
});
@@ -276,6 +276,77 @@ static LEAN_FORK_CHOICE_REORG_DEPTH: std::sync::LazyLock<Histogram> =
276276
.unwrap()
277277
});
278278

279+
// --- Block Production ---
280+
281+
static LEAN_BLOCK_AGGREGATED_PAYLOADS: std::sync::LazyLock<Histogram> =
282+
std::sync::LazyLock::new(|| {
283+
register_histogram!(
284+
"lean_block_aggregated_payloads",
285+
"Number of aggregated_payloads in a block",
286+
vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
287+
)
288+
.unwrap()
289+
});
290+
291+
static LEAN_BLOCK_BUILDING_PAYLOAD_AGGREGATION_TIME_SECONDS: std::sync::LazyLock<Histogram> =
292+
std::sync::LazyLock::new(|| {
293+
register_histogram!(
294+
"lean_block_building_payload_aggregation_time_seconds",
295+
"Time taken to build aggregated_payloads during block building",
296+
vec![0.1, 0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0]
297+
)
298+
.unwrap()
299+
});
300+
301+
static LEAN_BLOCK_BUILDING_TIME_SECONDS: std::sync::LazyLock<Histogram> =
302+
std::sync::LazyLock::new(|| {
303+
register_histogram!(
304+
"lean_block_building_time_seconds",
305+
"Time taken to build a block",
306+
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0]
307+
)
308+
.unwrap()
309+
});
310+
311+
static LEAN_BLOCK_BUILDING_SUCCESS_TOTAL: std::sync::LazyLock<IntCounter> =
312+
std::sync::LazyLock::new(|| {
313+
register_int_counter!(
314+
"lean_block_building_success_total",
315+
"Successful block builds"
316+
)
317+
.unwrap()
318+
});
319+
320+
static LEAN_BLOCK_BUILDING_FAILURES_TOTAL: std::sync::LazyLock<IntCounter> =
321+
std::sync::LazyLock::new(|| {
322+
register_int_counter!("lean_block_building_failures_total", "Failed block builds").unwrap()
323+
});
324+
325+
// --- Sync Status ---
326+
327+
/// Node synchronization status.
328+
pub enum SyncStatus {
329+
Idle,
330+
Syncing,
331+
Synced,
332+
}
333+
334+
impl SyncStatus {
335+
fn as_str(&self) -> &'static str {
336+
match self {
337+
SyncStatus::Idle => "idle",
338+
SyncStatus::Syncing => "syncing",
339+
SyncStatus::Synced => "synced",
340+
}
341+
}
342+
343+
const ALL: &[&str] = &["idle", "syncing", "synced"];
344+
}
345+
346+
static LEAN_NODE_SYNC_STATUS: std::sync::LazyLock<IntGaugeVec> = std::sync::LazyLock::new(|| {
347+
register_int_gauge_vec!("lean_node_sync_status", "Node sync status", &["status"]).unwrap()
348+
});
349+
279350
// --- Initialization ---
280351

281352
/// Register all metrics with the Prometheus registry so they appear in `/metrics` from startup.
@@ -315,6 +386,14 @@ pub fn init() {
315386
std::sync::LazyLock::force(&LEAN_PQ_SIG_AGGREGATED_SIGNATURES_VERIFICATION_TIME_SECONDS);
316387
std::sync::LazyLock::force(&LEAN_COMMITTEE_SIGNATURES_AGGREGATION_TIME_SECONDS);
317388
std::sync::LazyLock::force(&LEAN_FORK_CHOICE_REORG_DEPTH);
389+
// Block production
390+
std::sync::LazyLock::force(&LEAN_BLOCK_AGGREGATED_PAYLOADS);
391+
std::sync::LazyLock::force(&LEAN_BLOCK_BUILDING_PAYLOAD_AGGREGATION_TIME_SECONDS);
392+
std::sync::LazyLock::force(&LEAN_BLOCK_BUILDING_TIME_SECONDS);
393+
std::sync::LazyLock::force(&LEAN_BLOCK_BUILDING_SUCCESS_TOTAL);
394+
std::sync::LazyLock::force(&LEAN_BLOCK_BUILDING_FAILURES_TOTAL);
395+
// Sync status
396+
std::sync::LazyLock::force(&LEAN_NODE_SYNC_STATUS);
318397
}
319398

320399
// --- Public API ---
@@ -476,3 +555,38 @@ pub fn set_attestation_committee_count(count: u64) {
476555
pub fn observe_fork_choice_reorg_depth(depth: u64) {
477556
LEAN_FORK_CHOICE_REORG_DEPTH.observe(depth as f64);
478557
}
558+
559+
/// Observe the number of aggregated payloads in a built block.
560+
pub fn observe_block_aggregated_payloads(count: usize) {
561+
LEAN_BLOCK_AGGREGATED_PAYLOADS.observe(count as f64);
562+
}
563+
564+
/// Start timing payload aggregation during block building. Records duration when the guard is dropped.
565+
pub fn time_block_building_payload_aggregation() -> TimingGuard {
566+
TimingGuard::new(&LEAN_BLOCK_BUILDING_PAYLOAD_AGGREGATION_TIME_SECONDS)
567+
}
568+
569+
/// Start timing block building. Records duration when the guard is dropped.
570+
pub fn time_block_building() -> TimingGuard {
571+
TimingGuard::new(&LEAN_BLOCK_BUILDING_TIME_SECONDS)
572+
}
573+
574+
/// Increment the successful block builds counter.
575+
pub fn inc_block_building_success() {
576+
LEAN_BLOCK_BUILDING_SUCCESS_TOTAL.inc();
577+
}
578+
579+
/// Increment the failed block builds counter.
580+
pub fn inc_block_building_failures() {
581+
LEAN_BLOCK_BUILDING_FAILURES_TOTAL.inc();
582+
}
583+
584+
/// Set the node sync status. Sets the given status label to 1 and all others to 0.
585+
pub fn set_node_sync_status(status: SyncStatus) {
586+
let active = status.as_str();
587+
for label in SyncStatus::ALL {
588+
LEAN_NODE_SYNC_STATUS
589+
.with_label_values(&[label])
590+
.set(i64::from(*label == active));
591+
}
592+
}

crates/blockchain/src/store.rs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,14 +1015,19 @@ pub fn produce_block_with_signatures(
10151015

10161016
let known_block_roots = store.get_block_roots();
10171017

1018-
let (block, signatures, post_checkpoints) = build_block(
1019-
&head_state,
1020-
slot,
1021-
validator_index,
1022-
head_root,
1023-
&known_block_roots,
1024-
&aggregated_payloads,
1025-
)?;
1018+
let (block, signatures, post_checkpoints) = {
1019+
let _timing = metrics::time_block_building_payload_aggregation();
1020+
build_block(
1021+
&head_state,
1022+
slot,
1023+
validator_index,
1024+
head_root,
1025+
&known_block_roots,
1026+
&aggregated_payloads,
1027+
)?
1028+
};
1029+
1030+
metrics::observe_block_aggregated_payloads(signatures.len());
10261031

10271032
Ok((block, signatures, post_checkpoints))
10281033
}

crates/net/p2p/src/gossipsub/handler.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use super::{
1515
attestation_subnet_topic,
1616
},
1717
};
18-
use crate::P2PServer;
18+
use crate::{P2PServer, metrics};
1919

2020
pub async fn handle_gossipsub_message(server: &mut P2PServer, event: Event) {
2121
let Event::Message {
@@ -34,6 +34,7 @@ pub async fn handle_gossipsub_message(server: &mut P2PServer, event: Event) {
3434
else {
3535
return;
3636
};
37+
metrics::observe_gossip_block_size(uncompressed_data.len());
3738

3839
let Ok(signed_block) = SignedBlock::from_ssz_bytes(&uncompressed_data)
3940
.inspect_err(|err| error!(?err, "Failed to decode gossipped block"))
@@ -65,6 +66,7 @@ pub async fn handle_gossipsub_message(server: &mut P2PServer, event: Event) {
6566
else {
6667
return;
6768
};
69+
metrics::observe_gossip_aggregation_size(uncompressed_data.len());
6870

6971
let Ok(aggregation) = SignedAggregatedAttestation::from_ssz_bytes(&uncompressed_data)
7072
.inspect_err(|err| error!(?err, "Failed to decode gossipped aggregation"))
@@ -94,6 +96,7 @@ pub async fn handle_gossipsub_message(server: &mut P2PServer, event: Event) {
9496
else {
9597
return;
9698
};
99+
metrics::observe_gossip_attestation_size(uncompressed_data.len());
97100

98101
let Ok(signed_attestation) = SignedAttestation::from_ssz_bytes(&uncompressed_data)
99102
.inspect_err(|err| error!(?err, "Failed to decode gossipped attestation"))

crates/net/p2p/src/metrics.rs

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,68 @@ static LEAN_PEER_DISCONNECTION_EVENTS_TOTAL: LazyLock<IntCounterVec> = LazyLock:
6868
.unwrap()
6969
});
7070

71+
// --- Gossip Message Size Histograms ---
72+
73+
static LEAN_GOSSIP_BLOCK_SIZE_BYTES: LazyLock<Histogram> = LazyLock::new(|| {
74+
register_histogram!(
75+
"lean_gossip_block_size_bytes",
76+
"Bytes size of a gossip block message",
77+
vec![
78+
10_000.0,
79+
50_000.0,
80+
100_000.0,
81+
250_000.0,
82+
500_000.0,
83+
1_000_000.0,
84+
2_000_000.0,
85+
5_000_000.0
86+
]
87+
)
88+
.unwrap()
89+
});
90+
91+
static LEAN_GOSSIP_ATTESTATION_SIZE_BYTES: LazyLock<Histogram> = LazyLock::new(|| {
92+
register_histogram!(
93+
"lean_gossip_attestation_size_bytes",
94+
"Bytes size of a gossip attestation message",
95+
vec![512.0, 1024.0, 2048.0, 4096.0, 8192.0, 16384.0]
96+
)
97+
.unwrap()
98+
});
99+
100+
static LEAN_GOSSIP_AGGREGATION_SIZE_BYTES: LazyLock<Histogram> = LazyLock::new(|| {
101+
register_histogram!(
102+
"lean_gossip_aggregation_size_bytes",
103+
"Bytes size of a gossip aggregated attestation message",
104+
vec![
105+
1024.0,
106+
4096.0,
107+
16384.0,
108+
65536.0,
109+
131_072.0,
110+
262_144.0,
111+
524_288.0,
112+
1_048_576.0
113+
]
114+
)
115+
.unwrap()
116+
});
117+
118+
/// Observe the size of a gossip block message.
119+
pub fn observe_gossip_block_size(bytes: usize) {
120+
LEAN_GOSSIP_BLOCK_SIZE_BYTES.observe(bytes as f64);
121+
}
122+
123+
/// Observe the size of a gossip attestation message.
124+
pub fn observe_gossip_attestation_size(bytes: usize) {
125+
LEAN_GOSSIP_ATTESTATION_SIZE_BYTES.observe(bytes as f64);
126+
}
127+
128+
/// Observe the size of a gossip aggregated attestation message.
129+
pub fn observe_gossip_aggregation_size(bytes: usize) {
130+
LEAN_GOSSIP_AGGREGATION_SIZE_BYTES.observe(bytes as f64);
131+
}
132+
71133
/// Set the attestation committee subnet gauge.
72134
pub fn set_attestation_committee_subnet(subnet_id: u64) {
73135
static LEAN_ATTESTATION_COMMITTEE_SUBNET: LazyLock<IntGauge> = LazyLock::new(|| {

0 commit comments

Comments
 (0)