|
1 | | -const configs = new reports.HTTPArchiveReports() |
2 | | -const metrics = configs.listMetrics() |
3 | | -const lenses = configs.lenses |
4 | | - |
5 | | -const bucket = constants.bucket |
6 | | -const storagePath = constants.storagePath |
7 | | -const dataset = 'reports' |
8 | | - |
9 | | -// Adjust start and end dates to update reports retrospectively |
10 | | -const startDate = constants.currentMonth; // '2025-07-01' |
11 | | -const endDate = constants.currentMonth; // '2025-07-01' |
12 | | - |
13 | | -function generateExportPath (params) { |
14 | | - objectName = storagePath |
15 | | - if (params.sql.type === 'histogram') { |
16 | | - objectName = objectName + params.date.replaceAll('-', '_') + '/' + params.metric.id |
17 | | - } else if (params.sql.type === 'timeseries') { |
18 | | - objectName = objectName + params.metric.id |
| 1 | +/** |
| 2 | + * Dynamic Reports Generator |
| 3 | + * |
| 4 | + * This file automatically generates Dataform operations for HTTP Archive reports. |
| 5 | + * It creates operations for each combination of: |
| 6 | + * - Date range (from startDate to endDate) |
| 7 | + * - Metrics (defined in includes/reports.js) |
| 8 | + * - SQL types (histogram, timeseries) |
| 9 | + * - Lenses (data filters like all, top1k, wordpress, etc.) |
| 10 | + * |
| 11 | + * Each operation: |
| 12 | + * 1. Calculates metrics from crawl data |
| 13 | + * 2. Stores results in BigQuery tables |
| 14 | + * 3. Exports data to Cloud Storage as JSON |
| 15 | + */ |
| 16 | + |
| 17 | +// Initialize configurations |
| 18 | +const httpArchiveReports = new reports.HTTPArchiveReports() |
| 19 | +const availableMetrics = httpArchiveReports.listMetrics() |
| 20 | +const availableLenses = httpArchiveReports.lenses |
| 21 | + |
| 22 | +// Configuration constants |
| 23 | +const EXPORT_CONFIG = { |
| 24 | + bucket: constants.bucket, |
| 25 | + storagePath: constants.storagePath, |
| 26 | + dataset: 'reports', |
| 27 | + testSuffix: '_test.json' // TODO: remove test suffix from the path |
| 28 | +} |
| 29 | + |
| 30 | +// Date range for report generation |
| 31 | +// Adjust these dates to update reports retrospectively |
| 32 | +const DATE_RANGE = { |
| 33 | + startDate: constants.currentMonth, // '2025-07-01' |
| 34 | + endDate: constants.currentMonth // '2025-07-01' |
| 35 | +} |
| 36 | + |
| 37 | +/** |
| 38 | + * Generates the Cloud Storage export path for a report |
| 39 | + * @param {Object} reportConfig - Report configuration object |
| 40 | + * @returns {string} - Cloud Storage object path |
| 41 | + */ |
| 42 | +function buildExportPath(reportConfig) { |
| 43 | + const { sql, date, metric } = reportConfig |
| 44 | + let objectPath = EXPORT_CONFIG.storagePath |
| 45 | + |
| 46 | + if (sql.type === 'histogram') { |
| 47 | + // Histogram exports are organized by date folders |
| 48 | + const dateFolder = date.replaceAll('-', '_') |
| 49 | + objectPath += `${dateFolder}/${metric.id}` |
| 50 | + } else if (sql.type === 'timeseries') { |
| 51 | + // Timeseries exports are organized by metric |
| 52 | + objectPath += metric.id |
19 | 53 | } else { |
20 | | - throw new Error('Unknown SQL type') |
| 54 | + throw new Error(`Unknown SQL type: ${sql.type}`) |
21 | 55 | } |
22 | | - return objectName + '_test.json' // TODO: remove test suffix from the path |
| 56 | + |
| 57 | + return objectPath + EXPORT_CONFIG.testSuffix |
23 | 58 | } |
24 | 59 |
|
25 | | -function generateExportQuery (params) { |
26 | | - let query = '' |
27 | | - if (params.sql.type === 'histogram') { |
| 60 | +/** |
| 61 | + * Generates the BigQuery export query for a report |
| 62 | + * @param {Object} reportConfig - Report configuration object |
| 63 | + * @returns {string} - SQL query for exporting data |
| 64 | + */ |
| 65 | +function buildExportQuery(reportConfig) { |
| 66 | + const { sql, date, metric, lens, tableName } = reportConfig |
| 67 | + |
| 68 | + let query |
| 69 | + if (sql.type === 'histogram') { |
28 | 70 | query = ` |
29 | | -SELECT |
30 | | - * EXCEPT(date, metric, lens) |
31 | | -FROM \`${dataset}.${params.tableName}\` |
32 | | -WHERE date = '${params.date}' |
33 | | - AND metric = '${params.metric.id}' |
34 | | - AND lens = '${params.lens.name}' |
35 | | -ORDER BY bin ASC |
36 | | -` |
37 | | - } else if (params.sql.type === 'timeseries') { |
| 71 | + SELECT |
| 72 | + * EXCEPT(date, metric, lens) |
| 73 | + FROM \`${EXPORT_CONFIG.dataset}.${tableName}\` |
| 74 | + WHERE date = '${date}' |
| 75 | + AND metric = '${metric.id}' |
| 76 | + AND lens = '${lens.name}' |
| 77 | + ORDER BY bin ASC |
| 78 | + ` |
| 79 | + } else if (sql.type === 'timeseries') { |
38 | 80 | query = ` |
39 | | -SELECT |
40 | | - FORMAT_DATE('%Y_%m_%d', date) AS date, |
41 | | - * EXCEPT(date, metric, lens) |
42 | | -FROM \`${dataset}.${params.tableName}\` |
43 | | -WHERE metric = '${params.metric.id}' |
44 | | - AND lens = '${params.lens.name}' |
45 | | -ORDER BY date DESC |
46 | | -` |
| 81 | + SELECT |
| 82 | + FORMAT_DATE('%Y_%m_%d', date) AS date, |
| 83 | + * EXCEPT(date, metric, lens) |
| 84 | + FROM \`${EXPORT_CONFIG.dataset}.${tableName}\` |
| 85 | + WHERE metric = '${metric.id}' |
| 86 | + AND lens = '${lens.name}' |
| 87 | + ORDER BY date DESC |
| 88 | + ` |
47 | 89 | } else { |
48 | | - throw new Error('Unknown SQL type') |
| 90 | + throw new Error(`Unknown SQL type: ${sql.type}`) |
49 | 91 | } |
50 | 92 |
|
51 | | - const queryOutput = query.replace(/[\r\n]+/g, ' ') |
52 | | - return queryOutput |
| 93 | + // Convert to single line for JSON embedding |
| 94 | + return query.replace(/[\r\n]+/g, ' ').trim() |
53 | 95 | } |
54 | 96 |
|
55 | | -const iterations = [] |
56 | | -// dates |
57 | | -for ( |
58 | | - let date = endDate; |
59 | | - date >= startDate; |
60 | | - date = constants.fnPastMonth(date) |
61 | | -) { |
62 | | - // metrics |
63 | | - metrics.forEach(metric => { |
64 | | - // timeseries and histograms |
65 | | - metric.SQL.forEach(sql => { |
66 | | - // lenses |
67 | | - for (const [key, value] of Object.entries(lenses)) { |
68 | | - iterations.push({ |
69 | | - date, |
70 | | - metric, |
71 | | - sql, |
72 | | - lens: { name: key, sql: value }, |
73 | | - devRankFilter: constants.devRankFilter, |
74 | | - tableName: metric.id + '_' + sql.type |
| 97 | +/** |
| 98 | + * Creates a report configuration object |
| 99 | + * @param {string} date - Report date (YYYY-MM-DD) |
| 100 | + * @param {Object} metric - Metric configuration |
| 101 | + * @param {Object} sql - SQL configuration (type and query) |
| 102 | + * @param {string} lensName - Lens name |
| 103 | + * @param {string} lensSQL - Lens SQL filter |
| 104 | + * @returns {Object} - Complete report configuration |
| 105 | + */ |
| 106 | +function createReportConfig(date, metric, sql, lensName, lensSQL) { |
| 107 | + return { |
| 108 | + date, |
| 109 | + metric, |
| 110 | + sql, |
| 111 | + lens: { name: lensName, sql: lensSQL }, |
| 112 | + devRankFilter: constants.devRankFilter, |
| 113 | + tableName: `${metric.id}_${sql.type}` |
| 114 | + } |
| 115 | +} |
| 116 | + |
| 117 | +/** |
| 118 | + * Generates all report configurations for the specified date range |
| 119 | + * @returns {Array} - Array of report configuration objects |
| 120 | + */ |
| 121 | +function generateReportConfigurations() { |
| 122 | + const reportConfigs = [] |
| 123 | + |
| 124 | + // Generate configurations for each date in range |
| 125 | + for (let date = DATE_RANGE.endDate; |
| 126 | + date >= DATE_RANGE.startDate; |
| 127 | + date = constants.fnPastMonth(date)) { |
| 128 | + |
| 129 | + // For each available metric |
| 130 | + availableMetrics.forEach(metric => { |
| 131 | + // For each SQL type (histogram, timeseries) |
| 132 | + metric.SQL.forEach(sql => { |
| 133 | + // For each available lens (all, top1k, wordpress, etc.) |
| 134 | + Object.entries(availableLenses).forEach(([lensName, lensSQL]) => { |
| 135 | + const config = createReportConfig(date, metric, sql, lensName, lensSQL) |
| 136 | + reportConfigs.push(config) |
75 | 137 | }) |
76 | | - } |
| 138 | + }) |
77 | 139 | }) |
78 | | - }) |
| 140 | + } |
| 141 | + |
| 142 | + return reportConfigs |
| 143 | +} |
| 144 | + |
| 145 | +/** |
| 146 | + * Creates a Dataform operation name for a report configuration |
| 147 | + * @param {Object} reportConfig - Report configuration object |
| 148 | + * @returns {string} - Operation name |
| 149 | + */ |
| 150 | +function createOperationName(reportConfig) { |
| 151 | + const { tableName, date, lens } = reportConfig |
| 152 | + return `${tableName}_${date}_${lens.name}` |
| 153 | +} |
| 154 | + |
| 155 | +/** |
| 156 | + * Generates the SQL for a Dataform operation |
| 157 | + * @param {Object} ctx - Dataform context |
| 158 | + * @param {Object} reportConfig - Report configuration object |
| 159 | + * @returns {string} - Complete SQL for the operation |
| 160 | + */ |
| 161 | +function generateOperationSQL(ctx, reportConfig) { |
| 162 | + const { date, metric, lens, sql, tableName } = reportConfig |
| 163 | + |
| 164 | + return ` |
| 165 | + DECLARE job_config JSON; |
| 166 | +
|
| 167 | + /* First report run - uncomment to create table |
| 168 | + CREATE TABLE IF NOT EXISTS ${EXPORT_CONFIG.dataset}.${tableName} |
| 169 | + PARTITION BY date |
| 170 | + CLUSTER BY metric, lens, client |
| 171 | + AS |
| 172 | + */ |
| 173 | +
|
| 174 | + --/* Subsequent report run |
| 175 | + DELETE FROM ${EXPORT_CONFIG.dataset}.${tableName} |
| 176 | + WHERE date = '${date}' |
| 177 | + AND metric = '${metric.id}' |
| 178 | + AND lens = '${lens.name}'; |
| 179 | + INSERT INTO ${EXPORT_CONFIG.dataset}.${tableName} |
| 180 | + --*/ |
| 181 | +
|
| 182 | + SELECT |
| 183 | + '${metric.id}' AS metric, |
| 184 | + '${lens.name}' AS lens, |
| 185 | + * |
| 186 | + FROM ( |
| 187 | + ${sql.query(ctx, reportConfig)} |
| 188 | + ); |
| 189 | +
|
| 190 | + SET job_config = TO_JSON( |
| 191 | + STRUCT( |
| 192 | + "cloud_storage" AS destination, |
| 193 | + STRUCT( |
| 194 | + "httparchive" AS bucket, |
| 195 | + "${buildExportPath(reportConfig)}" AS name |
| 196 | + ) AS config, |
| 197 | + r"${buildExportQuery(reportConfig)}" AS query |
| 198 | + ) |
| 199 | + ); |
| 200 | +
|
| 201 | + SELECT reports.run_export_job(job_config); |
| 202 | + ` |
79 | 203 | } |
80 | 204 |
|
81 | | -iterations.forEach((params, i) => { |
82 | | - operate(params.tableName + '_' + params.date + '_' + params.lens.name) |
| 205 | +// Generate all report configurations |
| 206 | +const reportConfigurations = generateReportConfigurations() |
| 207 | + |
| 208 | +// Create Dataform operations for each report configuration |
| 209 | +reportConfigurations.forEach(reportConfig => { |
| 210 | + const operationName = createOperationName(reportConfig) |
| 211 | + |
| 212 | + operate(operationName) |
83 | 213 | .tags(['crawl_complete', 'reports']) |
84 | | - .queries(ctx => ` |
85 | | -DECLARE job_config JSON; |
86 | | -
|
87 | | -/* First report run |
88 | | -CREATE TABLE IF NOT EXISTS ${dataset}.${params.tableName} |
89 | | -PARTITION BY date |
90 | | -CLUSTER BY metric, lens, client |
91 | | -AS |
92 | | -*/ |
93 | | -
|
94 | | ---/* Subsequent report run |
95 | | -DELETE FROM ${dataset}.${params.tableName} |
96 | | -WHERE date = '${params.date}' |
97 | | - AND metric = '${params.metric.id}' |
98 | | - AND lens = '${params.lens.name}'; |
99 | | -INSERT INTO ${dataset}.${params.tableName} |
100 | | ---*/ |
101 | | -
|
102 | | -SELECT |
103 | | - '${params.metric.id}' AS metric, |
104 | | - '${params.lens.name}' AS lens, |
105 | | - * |
106 | | -FROM ( |
107 | | - ${params.sql.query(ctx, params)} |
108 | | -); |
109 | | -
|
110 | | -SET job_config = TO_JSON( |
111 | | - STRUCT( |
112 | | - "cloud_storage" AS destination, |
113 | | - STRUCT( |
114 | | - "httparchive" AS bucket, |
115 | | - "${generateExportPath(params)}" AS name |
116 | | - ) AS config, |
117 | | - r"${generateExportQuery(params)}" AS query |
118 | | - ) |
119 | | -); |
120 | | -
|
121 | | -SELECT reports.run_export_job(job_config); |
122 | | - `) |
| 214 | + .queries(ctx => generateOperationSQL(ctx, reportConfig)) |
123 | 215 | }) |
0 commit comments