// Month processed by this run, obtained from the shared `constants` helper.
// It is interpolated into the SQL below as a quoted literal and is treated as
// a string (`replaceAll` is called on it when building the export path).
// NOTE(review): presumably formatted as YYYY-MM-DD -- confirm in `constants`.
const pastMonth = constants.fnPastMonth(constants.currentMonth)

/**
 * reports.html_elements_popularity (incremental)
 *
 * For the `pastMonth` crawl, reports for every client and HTML element name:
 *   - pages:       distinct root pages on which the element appears
 *   - total:       distinct root pages for that client
 *   - pct:         pages / total
 *   - sample_urls: up to five distinct page URLs, space separated
 * Elements seen on fewer than 10 distinct root pages are dropped.
 *
 * Stages:
 *   preOps  - defines the getElements() JS UDF and deletes any rows already
 *             stored for `pastMonth`, so a re-run replaces that month's rows
 *             instead of duplicating them.
 *   query   - computes the rows described above.
 *   postOps - calls reports.run_export_job with a cloud_storage destination
 *             to export the month's rows (without the date column).
 *
 * NOTE(review): `constants.devRankFilter` is spliced verbatim into the WHERE
 * clause; presumably it is empty or an extra `AND ...` predicate -- confirm.
 * NOTE(review): the ORDER BY applies to the SELECT feeding the incremental
 * insert, while the export query has no ORDER BY of its own -- confirm
 * whether the exported row order matters.
 */
publish('html_elements_popularity', {
  schema: 'reports',
  type: 'incremental',
  tags: ['crawl_complete'],
  description: 'Contact: https://github.com/bkardell'
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION getElements(payload STRING)
RETURNS ARRAY<STRING> LANGUAGE js AS '''
try {
  const elements = JSON.parse(payload);
  // JSON null parses to null, whose typeof is "object"; reject it explicitly
  // instead of relying on Object.keys(null) throwing into the catch below.
  if (elements === null || Array.isArray(elements) || typeof elements !== 'object') return [];
  return Object.keys(elements);
} catch (e) {
  return [];
}
''';

DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
WITH pages_data AS (
  SELECT
    date,
    client,
    root_page,
    page,
    custom_metrics.element_count
  FROM ${ctx.ref('crawl', 'pages')}
  WHERE
    date = '${pastMonth}' ${constants.devRankFilter}
),

totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS total
  FROM pages_data
  GROUP BY client
)

SELECT
  p.date,
  p.client,
  element,
  COUNT(DISTINCT p.root_page) AS pages,
  t.total,
  COUNT(DISTINCT p.root_page) / t.total AS pct,
  ARRAY_TO_STRING(ARRAY_AGG(DISTINCT p.page LIMIT 5), ' ') AS sample_urls
FROM pages_data p
JOIN totals t
ON p.client = t.client,
  UNNEST(getElements(TO_JSON_STRING(p.element_count))) AS element
GROUP BY
  p.date,
  p.client,
  t.total,
  element
HAVING
  COUNT(DISTINCT p.root_page) >= 10
ORDER BY
  pages / total DESC,
  client
`).postOps(ctx => `
SELECT
  reports.run_export_job(
    JSON '''{
      "destination": "cloud_storage",
      "config": {
        "bucket": "${constants.bucket}",
        "name": "${constants.storagePath}${pastMonth.replaceAll('-', '_')}/htmlElementPopularity.json"
      },
      "query": "SELECT * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${pastMonth}'"
    }'''
  );
`)