Skip to content

Commit 04e7e86

Browse files
Crawl columns descriptions across all table variants (#213)
* Refactor output definitions to use centralized column descriptions and create new latest and sample_data files
* Add descriptions for columns in the WebPageTest data structure
* lint
1 parent 625eb81 commit 04e7e86

11 files changed

Lines changed: 194 additions & 80 deletions

File tree

.github/linters/eslint.config.mjs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ export default [
2222
ctx: 'readonly',
2323
constants: 'readonly',
2424
reports: 'readonly',
25-
reservations: 'readonly'
25+
reservations: 'readonly',
26+
descriptions: 'readonly'
2627
}
2728
},
2829
rules: {

definitions/output/crawl/pages.js

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
const columns = descriptions.columns.pages
2+
13
// See https://github.com/HTTPArchive/dataform/issues/43
24
assert('corrupted_technology_values')
35
.tags(['crawl_complete'])
@@ -53,46 +55,7 @@ publish('pages', {
5355
clusterBy: ['client', 'is_root_page', 'rank', 'page'],
5456
requirePartitionFilter: true
5557
},
56-
columns: {
57-
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
58-
client: 'Test environment: desktop or mobile',
59-
page: 'The URL of the page being tested',
60-
is_root_page: 'Whether the page is the root of the origin',
61-
root_page: 'The URL of the root page being tested, the origin followed by /',
62-
rank: 'Site popularity rank, from CrUX',
63-
wptid: 'ID of the WebPageTest results',
64-
payload: 'JSON-encoded WebPageTest results for the page',
65-
summary: 'JSON-encoded summarization of the page-level data',
66-
custom_metrics: {
67-
description: 'Custom metrics from WebPageTest',
68-
columns: {
69-
a11y: 'JSON-encoded A11Y metrics',
70-
cms: 'JSON-encoded CMS detection',
71-
cookies: 'JSON-encoded cookie metrics',
72-
css_variables: 'JSON-encoded CSS variable metrics',
73-
ecommerce: 'JSON-encoded ecommerce metrics',
74-
element_count: 'JSON-encoded element count metrics',
75-
javascript: 'JSON-encoded JavaScript metrics',
76-
markup: 'JSON-encoded markup metrics',
77-
media: 'JSON-encoded media metrics',
78-
origin_trials: 'JSON-encoded origin trial metrics',
79-
performance: 'JSON-encoded performance metrics',
80-
privacy: 'JSON-encoded privacy metrics',
81-
responsive_images: 'JSON-encoded responsive image metrics',
82-
robots_txt: 'JSON-encoded robots.txt metrics',
83-
security: 'JSON-encoded security metrics',
84-
structured_data: 'JSON-encoded structured data metrics',
85-
third_parties: 'JSON-encoded third-party metrics',
86-
well_known: 'JSON-encoded well-known metrics',
87-
wpt_bodies: 'JSON-encoded WebPageTest bodies',
88-
other: 'JSON-encoded other custom metrics'
89-
}
90-
},
91-
lighthouse: 'JSON-encoded Lighthouse report',
92-
features: 'Blink features detected at runtime (see https://chromestatus.com/features)',
93-
technologies: 'Technologies detected at runtime (see https://www.wappalyzer.com/)',
94-
metadata: 'Additional metadata about the test'
95-
},
58+
columns: columns,
9659
tags: ['crawl_complete'],
9760
dependOnDependencyAssertions: true
9861
}).preOps(ctx => `

definitions/output/crawl/parsed_css.js

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
const columns = descriptions.columns.parsed_css
2+
13
publish('parsed_css', {
24
type: 'incremental',
35
protected: true,
@@ -7,16 +9,7 @@ publish('parsed_css', {
79
clusterBy: ['client', 'is_root_page', 'rank', 'page'],
810
requirePartitionFilter: true
911
},
10-
columns: {
11-
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
12-
client: 'Test environment: desktop or mobile',
13-
page: 'The URL of the page being tested',
14-
is_root_page: 'Whether the page is the root of the origin.',
15-
root_page: 'The URL of the root page being tested',
16-
rank: 'Site popularity rank, from CrUX',
17-
url: 'The URL of the request',
18-
css: 'The parsed CSS, in JSON format'
19-
},
12+
columns: columns,
2013
tags: ['crawl_complete']
2114
}).preOps(ctx => `
2215
${reservations.reservation_setter(ctx)}

definitions/output/crawl/requests.js

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
const columns = descriptions.columns.requests
2+
13
publish('requests', {
24
type: 'incremental',
35
protected: true,
@@ -7,35 +9,7 @@ publish('requests', {
79
clusterBy: ['client', 'is_root_page', 'type', 'rank'],
810
requirePartitionFilter: true
911
},
10-
columns: {
11-
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
12-
client: 'Test environment: desktop or mobile',
13-
page: 'The URL of the page being tested',
14-
is_root_page: 'Whether the page is the root of the origin.',
15-
root_page: 'The URL of the root page being tested',
16-
rank: 'Site popularity rank, from CrUX',
17-
url: 'The URL of the request',
18-
is_main_document: 'Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects',
19-
type: 'Simplified description of the type of resource (script, html, css, text, other, etc)',
20-
index: 'The sequential 0-based index of the request',
21-
payload: 'JSON-encoded WebPageTest result data for this request',
22-
summary: 'JSON-encoded summarization of request data',
23-
request_headers: {
24-
description: 'Request headers',
25-
columns: {
26-
name: 'Request header name',
27-
value: 'Request header value'
28-
}
29-
},
30-
response_headers: {
31-
description: 'Response headers',
32-
columns: {
33-
name: 'Response header name',
34-
value: 'Response header value'
35-
}
36-
},
37-
response_body: 'Text-based response body'
38-
},
12+
columns: columns,
3913
tags: ['crawl_complete']
4014
}).preOps(ctx => `
4115
${reservations.reservation_setter(ctx)}

definitions/output/latest/pages.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
const columns = descriptions.columns.pages
2+
3+
publish('pages', {
4+
type: 'view',
5+
schema: 'latest',
6+
columns: columns
7+
}).query(ctx => `
8+
SELECT
9+
*
10+
FROM ${ctx.ref('crawl', 'pages')}
11+
WHERE
12+
date = (
13+
SELECT
14+
PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
15+
FROM
16+
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
17+
WHERE
18+
table_name = 'pages' AND
19+
/* Only include actual dates in partition ids */
20+
partition_id >= '20250101' AND
21+
partition_id < '20990101' AND
22+
/* Exclude future dates - shouldn't be any, but you never know! */
23+
partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
24+
) AND
25+
/* The following should help make this even faster since above query is a little complex */
26+
/* We should never be more than 60 days old hopefully! */
27+
date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
28+
date <= CURRENT_DATE()
29+
`)
definitions/output/latest/parsed_css.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
const columns = descriptions.columns.parsed_css
2+
3+
publish('parsed_css', {
4+
type: 'view',
5+
schema: 'latest',
6+
columns: columns,
7+
}).query(ctx => `
8+
SELECT
9+
*
10+
FROM ${ctx.ref('crawl', 'parsed_css')}
11+
WHERE
12+
date = (
13+
SELECT
14+
PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
15+
FROM
16+
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
17+
WHERE
18+
table_name = 'parsed_css' AND
19+
/* Only include actual dates in partition ids */
20+
partition_id >= '20250101' AND
21+
partition_id < '20990101' AND
22+
/* Exclude future dates - shouldn't be any, but you never know! */
23+
partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
24+
) AND
25+
/* The following should help make this even faster since above query is a little complex */
26+
/* We should never be more than 60 days old hopefully! */
27+
date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
28+
date <= CURRENT_DATE()
29+
`)
definitions/output/latest/requests.js

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
const columns = descriptions.columns.requests
2+
3+
publish('requests', {
4+
type: 'view',
5+
schema: 'latest',
6+
columns: columns,
7+
}).query(ctx => `
8+
SELECT
9+
*
10+
FROM
11+
${ctx.ref('crawl', 'requests')}
12+
WHERE
13+
date = (
14+
SELECT
15+
PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
16+
FROM
17+
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
18+
WHERE
19+
table_name = 'requests' AND
20+
/* Only include actual dates in partition ids */
21+
partition_id >= '20250101' AND
22+
partition_id < '20990101' AND
23+
/* Exclude future dates - shouldn't be any, but you never know! */
24+
partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
25+
) AND
26+
/* The following should help make this even faster since above query is a little complex */
27+
/* We should never be more than 60 days old hopefully! */
28+
date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
29+
date <= CURRENT_DATE()
30+
`)

definitions/output/sample_data/pages_10k.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
const columns = descriptions.columns.pages
2+
13
publish('pages_10k', {
24
type: 'table',
35
schema: 'sample_data',
46
bigquery: {
57
partitionBy: 'date',
68
clusterBy: ['client', 'is_root_page', 'rank', 'page']
79
},
10+
columns: columns,
811
tags: ['crawl_complete']
912
}).query(ctx => `
1013
SELECT *

definitions/output/sample_data/parsed_css_10k.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
const columns = descriptions.columns.parsed_css
2+
13
publish('parsed_css_10k', {
24
type: 'table',
35
schema: 'sample_data',
46
bigquery: {
57
partitionBy: 'date',
68
clusterBy: ['client', 'is_root_page', 'rank', 'page']
79
},
10+
columns: columns,
811
tags: ['crawl_complete']
912
}).query(ctx => `
1013
SELECT *

definitions/output/sample_data/requests_10k.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
const columns = descriptions.columns.requests
2+
13
publish('requests_10k', {
24
type: 'table',
35
schema: 'sample_data',
46
bigquery: {
57
partitionBy: 'date',
68
clusterBy: ['client', 'is_root_page', 'rank', 'type']
79
},
10+
columns: columns,
811
tags: ['crawl_complete']
912
}).query(ctx => `
1013
SELECT *

0 commit comments

Comments (0)