|
1 | | -const configs = new reports.HTTPArchiveReports() |
2 | | -const metrics = configs.listMetrics() |
| 1 | +/** |
| 2 | + * Dynamic Reports Generator |
| 3 | + * |
| 4 | + * This file automatically generates Dataform operations for HTTP Archive reports. |
| 5 | + * It creates operations for each combination of: |
| 6 | + * - Date range (from startDate to endDate) |
| 7 | + * - Metrics (defined in includes/reports.js) |
| 8 | + * - SQL types (histogram, timeseries) |
| 9 | + * - Lenses (data filters like all, top1k, wordpress, etc.) |
| 10 | + * |
| 11 | + * Each operation: |
| 12 | + * 1. Calculates metrics from crawl data |
| 13 | + * 2. Stores results in BigQuery tables |
| 14 | + * 3. Exports data to Cloud Storage as JSON |
| 15 | + */ |
3 | 16 |
|
4 | | -const bucket = 'httparchive' |
5 | | -const storagePath = '/reports/dev/' |
| 17 | +// Initialize configurations |
| 18 | +const httpArchiveReports = new reports.HTTPArchiveReports() |
| 19 | +const availableMetrics = httpArchiveReports.listMetrics() |
| 20 | +const availableLenses = httpArchiveReports.lenses |
| 21 | + |
| 22 | +// Configuration constants |
| 23 | +const EXPORT_CONFIG = { |
| 24 | + bucket: constants.bucket, |
| 25 | + storagePath: constants.storagePath, |
| 26 | + dataset: 'reports', |
| 27 | + testSuffix: '.json' |
| 28 | +} |
| 29 | + |
| 30 | +// Date range for report generation |
| 31 | +// Adjust these dates to update reports retrospectively |
| 32 | +const DATE_RANGE = { |
| 33 | + startDate: constants.currentMonth, // '2025-07-01' |
| 34 | + endDate: constants.currentMonth // '2025-07-01' |
| 35 | +} |
| 36 | + |
| 37 | +/** |
| 38 | + * Generates the Cloud Storage export path for a report |
| 39 | + * @param {Object} reportConfig - Report configuration object |
| 40 | + * @returns {string} - Cloud Storage object path |
| 41 | + */ |
| 42 | +function buildExportPath(reportConfig) { |
| 43 | + const { sql, date, metric } = reportConfig |
| 44 | + let objectPath = EXPORT_CONFIG.storagePath |
6 | 45 |
|
7 | | -function generateExportQuery (metric, sql, params, ctx) { |
8 | | - let query = '' |
9 | 46 | if (sql.type === 'histogram') { |
10 | | - query = ` |
11 | | -SELECT |
12 | | - * EXCEPT(date) |
13 | | -FROM ${ctx.self()} |
14 | | -WHERE date = '${params.date}' |
15 | | -` |
| 47 | + // Histogram exports are organized by date folders |
| 48 | + const dateFolder = date.replaceAll('-', '_') |
| 49 | + objectPath += `${dateFolder}/${metric.id}` |
16 | 50 | } else if (sql.type === 'timeseries') { |
17 | | - query = ` |
18 | | -SELECT |
19 | | - FORMAT_DATE('%Y_%m_%d', date) AS date, |
20 | | - * EXCEPT(date) |
21 | | -FROM ${ctx.self()} |
22 | | -` |
| 51 | + // Timeseries exports are organized by metric |
| 52 | + objectPath += metric.id |
23 | 53 | } else { |
24 | | - throw new Error('Unknown SQL type') |
| 54 | + throw new Error(`Unknown SQL type: ${sql.type}`) |
25 | 55 | } |
26 | 56 |
|
27 | | - const queryOutput = query.replace(/[\r\n]+/g, ' ') |
28 | | - return queryOutput |
| 57 | + return objectPath + EXPORT_CONFIG.testSuffix |
29 | 58 | } |
30 | 59 |
|
31 | | -function generateExportPath (metric, sql, params) { |
| 60 | +/** |
| 61 | + * Generates the BigQuery export query for a report |
| 62 | + * @param {Object} reportConfig - Report configuration object |
| 63 | + * @returns {string} - SQL query for exporting data |
| 64 | + */ |
| 65 | +function buildExportQuery(reportConfig) { |
| 66 | + const { sql, date, metric, lens, tableName } = reportConfig |
| 67 | + |
| 68 | + let query |
32 | 69 | if (sql.type === 'histogram') { |
33 | | - return `${storagePath}${params.date.replaceAll('-', '_')}/${metric.id}.json` |
| 70 | + query = ` |
| 71 | + SELECT |
| 72 | + * EXCEPT(date, metric, lens) |
| 73 | + FROM \`${EXPORT_CONFIG.dataset}.${tableName}\` |
| 74 | + WHERE date = '${date}' |
| 75 | + AND metric = '${metric.id}' |
| 76 | + AND lens = '${lens.name}' |
| 77 | + ORDER BY bin ASC |
| 78 | + ` |
34 | 79 | } else if (sql.type === 'timeseries') { |
35 | | - return `${storagePath}${metric.id}.json` |
| 80 | + query = ` |
| 81 | + SELECT |
| 82 | + FORMAT_DATE('%Y_%m_%d', date) AS date, |
| 83 | + * EXCEPT(date, metric, lens) |
| 84 | + FROM \`${EXPORT_CONFIG.dataset}.${tableName}\` |
| 85 | + WHERE metric = '${metric.id}' |
| 86 | + AND lens = '${lens.name}' |
| 87 | + ORDER BY date DESC |
| 88 | + ` |
36 | 89 | } else { |
37 | | - throw new Error('Unknown SQL type') |
| 90 | + throw new Error(`Unknown SQL type: ${sql.type}`) |
38 | 91 | } |
| 92 | + |
| 93 | + // Convert to single line for JSON embedding |
| 94 | + return query.replace(/[\r\n]+/g, ' ').trim() |
39 | 95 | } |
40 | 96 |
|
41 | | -const iterations = [] |
42 | | -for ( |
43 | | - let date = constants.currentMonth; date >= constants.currentMonth; date = constants.fnPastMonth(date)) { |
44 | | - iterations.push({ |
| 97 | +/** |
| 98 | + * Creates a report configuration object |
| 99 | + * @param {string} date - Report date (YYYY-MM-DD) |
| 100 | + * @param {Object} metric - Metric configuration |
| 101 | + * @param {Object} sql - SQL configuration (type and query) |
| 102 | + * @param {string} lensName - Lens name |
| 103 | + * @param {string} lensSQL - Lens SQL filter |
| 104 | + * @returns {Object} - Complete report configuration |
| 105 | + */ |
| 106 | +function createReportConfig(date, metric, sql, lensName, lensSQL) { |
| 107 | + return { |
45 | 108 | date, |
46 | | - devRankFilter: constants.devRankFilter |
47 | | - }) |
| 109 | + metric, |
| 110 | + sql, |
| 111 | + lens: { name: lensName, sql: lensSQL }, |
| 112 | + devRankFilter: constants.devRankFilter, |
| 113 | + tableName: `${metric.id}_${sql.type}` |
| 114 | + } |
48 | 115 | } |
49 | 116 |
|
50 | | -if (iterations.length === 1) { |
51 | | - const params = iterations[0] |
52 | | - metrics.forEach(metric => { |
53 | | - metric.SQL.forEach(sql => { |
54 | | - publish(metric.id + '_' + sql.type, { |
55 | | - type: 'incremental', |
56 | | - protected: true, |
57 | | - bigquery: sql.type === 'histogram' ? { partitionBy: 'date', clusterBy: ['client'] } : {}, |
58 | | - schema: 'reports' |
59 | | - // tags: ['crawl_complete', 'http_reports'] |
60 | | - }).preOps(ctx => ` |
61 | | ---DELETE FROM ${ctx.self()} |
62 | | ---WHERE date = '${params.date}'; |
63 | | - `).query( |
64 | | - ctx => sql.query(ctx, params) |
65 | | - ).postOps(ctx => ` |
66 | | -SELECT |
67 | | - reports.run_export_job( |
68 | | - JSON '''{ |
69 | | - "destination": "cloud_storage", |
70 | | - "config": { |
71 | | - "bucket": "${bucket}", |
72 | | - "name": "${generateExportPath(metric, sql, params)}" |
73 | | - }, |
74 | | - "query": "${generateExportQuery(metric, sql, params, ctx)}" |
75 | | - }''' |
76 | | - ); |
77 | | - `) |
78 | | - }) |
79 | | - }) |
80 | | -} else { |
81 | | - iterations.forEach(params => { |
82 | | - metrics.forEach(metric => { |
| 117 | +/** |
| 118 | + * Generates all report configurations for the specified date range |
| 119 | + * @returns {Array} - Array of report configuration objects |
| 120 | + */ |
| 121 | +function generateReportConfigurations() { |
| 122 | + const reportConfigs = [] |
| 123 | + |
| 124 | + // Generate configurations for each date in range |
| 125 | + for (let date = DATE_RANGE.endDate; |
| 126 | + date >= DATE_RANGE.startDate; |
| 127 | + date = constants.fnPastMonth(date)) { |
| 128 | + |
| 129 | + // For each available metric |
| 130 | + availableMetrics.forEach(metric => { |
| 131 | + // For each SQL type (histogram, timeseries) |
83 | 132 | metric.SQL.forEach(sql => { |
84 | | - operate(metric.id + '_' + sql.type + '_' + params.date, { |
85 | | - // tags: ['crawl_complete', 'http_reports'] |
86 | | - }).queries(ctx => ` |
87 | | -DELETE FROM reports.${metric.id}_${sql.type} |
88 | | -WHERE date = '${params.date}'; |
89 | | -
|
90 | | -INSERT INTO reports.${metric.id}_${sql.type}` + sql.query(ctx, params) |
91 | | - ).postOps(ctx => ` |
92 | | - SELECT |
93 | | - reports.run_export_job( |
94 | | - JSON '''{ |
95 | | - "destination": "cloud_storage", |
96 | | - "config": { |
97 | | - "bucket": "${bucket}", |
98 | | - "name": "${generateExportPath(metric, sql, params)}" |
99 | | - }, |
100 | | - "query": "${generateExportQuery(metric, sql, params, ctx)}" |
101 | | - }''' |
102 | | - ); |
103 | | - `) |
| 133 | + // For each available lens (all, top1k, wordpress, etc.) |
| 134 | + Object.entries(availableLenses).forEach(([lensName, lensSQL]) => { |
| 135 | + const config = createReportConfig(date, metric, sql, lensName, lensSQL) |
| 136 | + reportConfigs.push(config) |
| 137 | + }) |
104 | 138 | }) |
105 | 139 | }) |
106 | | - }) |
| 140 | + } |
| 141 | + |
| 142 | + return reportConfigs |
| 143 | +} |
| 144 | + |
| 145 | +/** |
| 146 | + * Creates a Dataform operation name for a report configuration |
| 147 | + * @param {Object} reportConfig - Report configuration object |
| 148 | + * @returns {string} - Operation name |
| 149 | + */ |
| 150 | +function createOperationName(reportConfig) { |
| 151 | + const { tableName, date, lens } = reportConfig |
| 152 | + return `${tableName}_${date}_${lens.name}` |
| 153 | +} |
| 154 | + |
| 155 | +/** |
| 156 | + * Generates the SQL for a Dataform operation |
| 157 | + * @param {Object} ctx - Dataform context |
| 158 | + * @param {Object} reportConfig - Report configuration object |
| 159 | + * @returns {string} - Complete SQL for the operation |
| 160 | + */ |
| 161 | +function generateOperationSQL(ctx, reportConfig) { |
| 162 | + const { date, metric, lens, sql, tableName } = reportConfig |
| 163 | + |
| 164 | + return ` |
| 165 | +DECLARE job_config JSON; |
| 166 | +
|
| 167 | +/* First report run - uncomment to create table |
| 168 | +CREATE TABLE IF NOT EXISTS ${EXPORT_CONFIG.dataset}.${tableName} |
| 169 | +PARTITION BY date |
| 170 | +CLUSTER BY metric, lens, client |
| 171 | +AS |
| 172 | +*/ |
| 173 | +
|
| 174 | +--/* Subsequent report run |
| 175 | +DELETE FROM ${EXPORT_CONFIG.dataset}.${tableName} |
| 176 | +WHERE date = '${date}' |
| 177 | + AND metric = '${metric.id}' |
| 178 | + AND lens = '${lens.name}'; |
| 179 | +INSERT INTO ${EXPORT_CONFIG.dataset}.${tableName} |
| 180 | +--*/ |
| 181 | +
|
| 182 | +SELECT |
| 183 | + '${metric.id}' AS metric, |
| 184 | + '${lens.name}' AS lens, |
| 185 | + * |
| 186 | +FROM ( |
| 187 | + ${sql.query(ctx, reportConfig)} |
| 188 | +); |
| 189 | +
|
| 190 | +SET job_config = TO_JSON( |
| 191 | + STRUCT( |
| 192 | + "cloud_storage" AS destination, |
| 193 | + STRUCT( |
| 194 | + "httparchive" AS bucket, |
| 195 | + "${buildExportPath(reportConfig)}" AS name |
| 196 | + ) AS config, |
| 197 | + r"${buildExportQuery(reportConfig)}" AS query |
| 198 | + ) |
| 199 | +); |
| 200 | +
|
| 201 | +SELECT reports.run_export_job(job_config); |
| 202 | +` |
107 | 203 | } |
| 204 | + |
| 205 | +// Generate all report configurations |
| 206 | +const reportConfigurations = generateReportConfigurations() |
| 207 | + |
| 208 | +// Create Dataform operations for each report configuration |
| 209 | +reportConfigurations.forEach(reportConfig => { |
| 210 | + const operationName = createOperationName(reportConfig) |
| 211 | + |
| 212 | + operate(operationName) |
| 213 | + .tags(['crawl_complete', 'crawl_reports']) |
| 214 | + .queries(ctx => generateOperationSQL(ctx, reportConfig)) |
| 215 | +}) |
0 commit comments