Skip to content

Commit ce50c15

Browse files
Completing reports migration (#41)
* fix discrepancies
* lenses
* sql updated
* lenses path
* spelling fix
* all metrics in the same table
* aggregated tables
* storage sync script
* conditional path
* common lenses
* lint
* update
* bytesTotal tested
* beautified
* formatting
* sync storage export
* docs
* rename
* cleanup
* VALIDATE_EDITORCONFIG
* lint
* cleanup
1 parent 7b8047e commit ce50c15

5 files changed

Lines changed: 586 additions & 111 deletions

File tree

Lines changed: 193 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,107 +1,215 @@
1-
const configs = new reports.HTTPArchiveReports()
2-
const metrics = configs.listMetrics()
1+
/**
2+
* Dynamic Reports Generator
3+
*
4+
* This file automatically generates Dataform operations for HTTP Archive reports.
5+
* It creates operations for each combination of:
6+
* - Date (each date in the range from startDate to endDate)
7+
* - Metrics (defined in includes/reports.js)
8+
* - SQL types (histogram, timeseries)
9+
* - Lenses (data filters like all, top1k, wordpress, etc.)
10+
*
11+
* Each operation:
12+
* 1. Calculates metrics from crawl data
13+
* 2. Stores results in BigQuery tables
14+
* 3. Exports data to Cloud Storage as JSON
15+
*/
316

4-
const bucket = 'httparchive'
5-
const storagePath = '/reports/dev/'
17+
// Initialize configurations
18+
const httpArchiveReports = new reports.HTTPArchiveReports()
19+
const availableMetrics = httpArchiveReports.listMetrics()
20+
const availableLenses = httpArchiveReports.lenses
21+
22+
// Configuration constants
23+
const EXPORT_CONFIG = {
24+
bucket: constants.bucket,
25+
storagePath: constants.storagePath,
26+
dataset: 'reports',
27+
testSuffix: '.json'
28+
}
29+
30+
// Date range for report generation
31+
// Set startDate earlier than endDate to regenerate reports for past dates
32+
const DATE_RANGE = {
33+
startDate: constants.currentMonth, // '2025-07-01'
34+
endDate: constants.currentMonth // '2025-07-01'
35+
}
36+
37+
/**
38+
* Generates the Cloud Storage export path for a report
39+
* @param {Object} reportConfig - Report configuration object
40+
* @returns {string} - Cloud Storage object path
41+
*/
42+
function buildExportPath(reportConfig) {
43+
const { sql, date, metric } = reportConfig
44+
let objectPath = EXPORT_CONFIG.storagePath
645

7-
function generateExportQuery (metric, sql, params, ctx) {
8-
let query = ''
946
if (sql.type === 'histogram') {
10-
query = `
11-
SELECT
12-
* EXCEPT(date)
13-
FROM ${ctx.self()}
14-
WHERE date = '${params.date}'
15-
`
47+
// Histogram exports are organized by date folders
48+
const dateFolder = date.replaceAll('-', '_')
49+
objectPath += `${dateFolder}/${metric.id}`
1650
} else if (sql.type === 'timeseries') {
17-
query = `
18-
SELECT
19-
FORMAT_DATE('%Y_%m_%d', date) AS date,
20-
* EXCEPT(date)
21-
FROM ${ctx.self()}
22-
`
51+
// Timeseries exports are organized by metric
52+
objectPath += metric.id
2353
} else {
24-
throw new Error('Unknown SQL type')
54+
throw new Error(`Unknown SQL type: ${sql.type}`)
2555
}
2656

27-
const queryOutput = query.replace(/[\r\n]+/g, ' ')
28-
return queryOutput
57+
return objectPath + EXPORT_CONFIG.testSuffix
2958
}
3059

31-
function generateExportPath (metric, sql, params) {
60+
/**
61+
* Generates the BigQuery export query for a report
62+
* @param {Object} reportConfig - Report configuration object
63+
* @returns {string} - SQL query for exporting data
64+
*/
65+
function buildExportQuery(reportConfig) {
66+
const { sql, date, metric, lens, tableName } = reportConfig
67+
68+
let query
3269
if (sql.type === 'histogram') {
33-
return `${storagePath}${params.date.replaceAll('-', '_')}/${metric.id}.json`
70+
query = `
71+
SELECT
72+
* EXCEPT(date, metric, lens)
73+
FROM \`${EXPORT_CONFIG.dataset}.${tableName}\`
74+
WHERE date = '${date}'
75+
AND metric = '${metric.id}'
76+
AND lens = '${lens.name}'
77+
ORDER BY bin ASC
78+
`
3479
} else if (sql.type === 'timeseries') {
35-
return `${storagePath}${metric.id}.json`
80+
query = `
81+
SELECT
82+
FORMAT_DATE('%Y_%m_%d', date) AS date,
83+
* EXCEPT(date, metric, lens)
84+
FROM \`${EXPORT_CONFIG.dataset}.${tableName}\`
85+
WHERE metric = '${metric.id}'
86+
AND lens = '${lens.name}'
87+
ORDER BY date DESC
88+
`
3689
} else {
37-
throw new Error('Unknown SQL type')
90+
throw new Error(`Unknown SQL type: ${sql.type}`)
3891
}
92+
93+
// Convert to single line for JSON embedding
94+
return query.replace(/[\r\n]+/g, ' ').trim()
3995
}
4096

41-
const iterations = []
42-
for (
43-
let date = constants.currentMonth; date >= constants.currentMonth; date = constants.fnPastMonth(date)) {
44-
iterations.push({
97+
/**
98+
* Creates a report configuration object
99+
* @param {string} date - Report date (YYYY-MM-DD)
100+
* @param {Object} metric - Metric configuration
101+
* @param {Object} sql - SQL configuration (type and query)
102+
* @param {string} lensName - Lens name
103+
* @param {string} lensSQL - Lens SQL filter
104+
* @returns {Object} - Complete report configuration
105+
*/
106+
function createReportConfig(date, metric, sql, lensName, lensSQL) {
107+
return {
45108
date,
46-
devRankFilter: constants.devRankFilter
47-
})
109+
metric,
110+
sql,
111+
lens: { name: lensName, sql: lensSQL },
112+
devRankFilter: constants.devRankFilter,
113+
tableName: `${metric.id}_${sql.type}`
114+
}
48115
}
49116

50-
if (iterations.length === 1) {
51-
const params = iterations[0]
52-
metrics.forEach(metric => {
53-
metric.SQL.forEach(sql => {
54-
publish(metric.id + '_' + sql.type, {
55-
type: 'incremental',
56-
protected: true,
57-
bigquery: sql.type === 'histogram' ? { partitionBy: 'date', clusterBy: ['client'] } : {},
58-
schema: 'reports'
59-
// tags: ['crawl_complete', 'http_reports']
60-
}).preOps(ctx => `
61-
--DELETE FROM ${ctx.self()}
62-
--WHERE date = '${params.date}';
63-
`).query(
64-
ctx => sql.query(ctx, params)
65-
).postOps(ctx => `
66-
SELECT
67-
reports.run_export_job(
68-
JSON '''{
69-
"destination": "cloud_storage",
70-
"config": {
71-
"bucket": "${bucket}",
72-
"name": "${generateExportPath(metric, sql, params)}"
73-
},
74-
"query": "${generateExportQuery(metric, sql, params, ctx)}"
75-
}'''
76-
);
77-
`)
78-
})
79-
})
80-
} else {
81-
iterations.forEach(params => {
82-
metrics.forEach(metric => {
117+
/**
118+
* Generates all report configurations for the specified date range
119+
* @returns {Array} - Array of report configuration objects
120+
*/
121+
function generateReportConfigurations() {
122+
const reportConfigs = []
123+
124+
// Generate configurations for each date in range
125+
for (let date = DATE_RANGE.endDate;
126+
date >= DATE_RANGE.startDate;
127+
date = constants.fnPastMonth(date)) {
128+
129+
// For each available metric
130+
availableMetrics.forEach(metric => {
131+
// For each SQL type (histogram, timeseries)
83132
metric.SQL.forEach(sql => {
84-
operate(metric.id + '_' + sql.type + '_' + params.date, {
85-
// tags: ['crawl_complete', 'http_reports']
86-
}).queries(ctx => `
87-
DELETE FROM reports.${metric.id}_${sql.type}
88-
WHERE date = '${params.date}';
89-
90-
INSERT INTO reports.${metric.id}_${sql.type}` + sql.query(ctx, params)
91-
).postOps(ctx => `
92-
SELECT
93-
reports.run_export_job(
94-
JSON '''{
95-
"destination": "cloud_storage",
96-
"config": {
97-
"bucket": "${bucket}",
98-
"name": "${generateExportPath(metric, sql, params)}"
99-
},
100-
"query": "${generateExportQuery(metric, sql, params, ctx)}"
101-
}'''
102-
);
103-
`)
133+
// For each available lens (all, top1k, wordpress, etc.)
134+
Object.entries(availableLenses).forEach(([lensName, lensSQL]) => {
135+
const config = createReportConfig(date, metric, sql, lensName, lensSQL)
136+
reportConfigs.push(config)
137+
})
104138
})
105139
})
106-
})
140+
}
141+
142+
return reportConfigs
143+
}
144+
145+
/**
146+
* Creates a Dataform operation name for a report configuration
147+
* @param {Object} reportConfig - Report configuration object
148+
* @returns {string} - Operation name
149+
*/
150+
function createOperationName(reportConfig) {
151+
const { tableName, date, lens } = reportConfig
152+
return `${tableName}_${date}_${lens.name}`
153+
}
154+
155+
/**
156+
* Generates the SQL for a Dataform operation
157+
* @param {Object} ctx - Dataform context
158+
* @param {Object} reportConfig - Report configuration object
159+
* @returns {string} - Complete SQL for the operation
160+
*/
161+
function generateOperationSQL(ctx, reportConfig) {
162+
const { date, metric, lens, sql, tableName } = reportConfig
163+
164+
return `
165+
DECLARE job_config JSON;
166+
167+
/* First report run - uncomment to create table
168+
CREATE TABLE IF NOT EXISTS ${EXPORT_CONFIG.dataset}.${tableName}
169+
PARTITION BY date
170+
CLUSTER BY metric, lens, client
171+
AS
172+
*/
173+
174+
--/* Subsequent report run
175+
DELETE FROM ${EXPORT_CONFIG.dataset}.${tableName}
176+
WHERE date = '${date}'
177+
AND metric = '${metric.id}'
178+
AND lens = '${lens.name}';
179+
INSERT INTO ${EXPORT_CONFIG.dataset}.${tableName}
180+
--*/
181+
182+
SELECT
183+
'${metric.id}' AS metric,
184+
'${lens.name}' AS lens,
185+
*
186+
FROM (
187+
${sql.query(ctx, reportConfig)}
188+
);
189+
190+
SET job_config = TO_JSON(
191+
STRUCT(
192+
"cloud_storage" AS destination,
193+
STRUCT(
194+
"httparchive" AS bucket,
195+
"${buildExportPath(reportConfig)}" AS name
196+
) AS config,
197+
r"${buildExportQuery(reportConfig)}" AS query
198+
)
199+
);
200+
201+
SELECT reports.run_export_job(job_config);
202+
`
107203
}
204+
205+
// Generate all report configurations
206+
const reportConfigurations = generateReportConfigurations()
207+
208+
// Create Dataform operations for each report configuration
209+
reportConfigurations.forEach(reportConfig => {
210+
const operationName = createOperationName(reportConfig)
211+
212+
operate(operationName)
213+
.tags(['crawl_complete', 'crawl_reports'])
214+
.queries(ctx => generateOperationSQL(ctx, reportConfig))
215+
})

includes/constants.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ class DataformTemplateBuilder {
4949
if (typeof value === 'string') return `'${value}'`
5050
if (typeof value === 'number') return value.toString()
5151
if (typeof value === 'boolean') return value.toString()
52+
if (typeof value === 'function') return value.toString()
5253

5354
// For objects or arrays, use JSON.stringify
5455
return JSON.stringify(value)

0 commit comments

Comments
 (0)