Skip to content

Commit a428173

Browse files
html_elements_popularity report (#140)
* report * fixes
1 parent ce50c15 commit a428173

1 file changed

Lines changed: 77 additions & 0 deletions

File tree

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
/**
 * Dataform action for the incremental table `reports.html_elements_popularity`.
 *
 * For the month in `pastMonth` it computes, per client and per HTML element
 * name, the number of distinct root pages whose `custom_metrics.element_count`
 * object has that element as a key, that count as a share of all distinct root
 * pages for the client, and up to five sample page URLs.
 *
 * Steps:
 *  - preOps:  defines the temporary JS UDF `getElements` and, on incremental
 *             runs only, deletes rows already stored for `pastMonth` so that
 *             re-running the action does not duplicate the month.
 *  - query:   builds the rows for `pastMonth` from `crawl.pages`.
 *  - postOps: calls `reports.run_export_job` with a JSON config naming a
 *             Cloud Storage bucket/path and a query over this table.
 *             NOTE(review): the export routine is defined outside this file;
 *             its behaviour is assumed from the config keys - confirm.
 *
 * `pastMonth` is produced by `constants.fnPastMonth(constants.currentMonth)`.
 * It is used below as a quoted date literal and has `-` replaced by `_` for
 * the export path, so it is presumably a `YYYY-MM-DD` string - TODO confirm.
 *
 * `constants.devRankFilter` is interpolated directly after the date predicate;
 * presumably an optional extra `AND ...` filter for dev runs - verify in the
 * constants module.
 */
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('html_elements_popularity', {
  schema: 'reports',
  type: 'incremental',
  tags: ['crawl_complete'],
  description: 'Contact: https://github.com/bkardell'
}).preOps(ctx => `
-- Returns the top-level keys of a JSON object payload. Arrays, scalars,
-- null and unparsable input all yield an empty array.
CREATE TEMPORARY FUNCTION getElements(payload STRING)
RETURNS ARRAY<STRING> LANGUAGE js AS '''
try {
  var elements = JSON.parse(payload);
  // typeof null is "object", so null is rejected explicitly.
  if (elements === null || Array.isArray(elements) || typeof elements !== 'object') return [];
  return Object.keys(elements);
} catch (e) {
  return [];
}
''';

${ctx.when(ctx.incremental(), `
-- Incremental runs only: on a first run or full refresh the target table is
-- (re)created, so there is nothing to delete and it may not exist yet.
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`)}
`).query(ctx => `
-- One row per crawled page for the reporting month.
WITH pages_data AS (
  SELECT
    date,
    client,
    root_page,
    page,
    custom_metrics.element_count
  FROM ${ctx.ref('crawl', 'pages')}
  WHERE
    date = '${pastMonth}' ${constants.devRankFilter}
),

-- Denominator: distinct root pages per client.
totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS total
  FROM pages_data
  GROUP BY client
)

SELECT
  p.date,
  p.client,
  element,
  COUNT(DISTINCT p.root_page) AS pages,
  t.total,
  COUNT(DISTINCT p.root_page) / t.total AS pct,
  ARRAY_TO_STRING(ARRAY_AGG(DISTINCT p.page LIMIT 5), ' ') AS sample_urls
FROM pages_data p
INNER JOIN totals t
  ON p.client = t.client
-- One row per element name found as a key of the element_count object.
CROSS JOIN UNNEST(getElements(TO_JSON_STRING(p.element_count))) AS element
GROUP BY
  p.date,
  p.client,
  t.total,
  element
-- Drop elements seen on fewer than 10 distinct root pages.
HAVING
  COUNT(DISTINCT p.root_page) >= 10
ORDER BY
  pages / total DESC,
  client
`).postOps(ctx => `
SELECT
  reports.run_export_job(
    JSON '''{
      "destination": "cloud_storage",
      "config": {
        "bucket": "${constants.bucket}",
        "name": "${constants.storagePath}${pastMonth.replaceAll('-', '_')}/htmlElementPopularity.json"
      },
      "query": "SELECT * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${pastMonth}'"
    }'''
  );
`)

0 commit comments

Comments
 (0)