Skip to content

Commit c26f79e

Browse files
committed
Merge remote-tracking branch 'origin/main' into standard_reports
2 parents 633895c + 43124f3 commit c26f79e

68 files changed

Lines changed: 3022 additions & 2437 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/dependabot.yml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
---
12
# To get started with Dependabot version updates, you'll need to specify which
23
# package ecosystems to update and where the package manifests are located.
34
# Please see the documentation for all configuration options:
@@ -10,6 +11,18 @@ updates:
1011
schedule:
1112
interval: "weekly"
1213
- package-ecosystem: "npm"
13-
directory: "/src"
14+
directory: "/infra/bigquery-export"
15+
schedule:
16+
interval: "weekly"
17+
- package-ecosystem: "npm"
18+
directory: "infra/dataform-export"
19+
schedule:
20+
interval: "weekly"
21+
- package-ecosystem: "npm"
22+
directory: "infra/dataform-trigger"
23+
schedule:
24+
interval: "weekly"
25+
- package-ecosystem: "terraform"
26+
directory: "infra/tf/"
1427
schedule:
1528
interval: "weekly"

.github/workflows/linter.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
---
12
name: Linter
23

34
on:

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
node_modules/
22
.DS_Store
3+
.venv/
34

45
# Terraform
56
infra/tf/.terraform/

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,9 @@ tf_plan:
1111

1212
tf_apply:
1313
terraform -chdir=infra/tf init && terraform -chdir=infra/tf apply -auto-approve
14+
15+
bigquery_export_deploy:
16+
cd infra/bigquery-export && npm run build
17+
18+
#bigquery_export_spark_deploy:
19+
# cd infra/bigquery_export_spark && gcloud builds submit --region=global --tag us-docker.pkg.dev/httparchive/bigquery-spark-procedures/firestore_export:latest

README.md

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ Consumers:
4747

4848
### Triggering workflows
4949

50-
In order to unify the workflow triggering mechanism, we use [a Cloud Run function](./src/README.md) that can be invoked in a number of ways (e.g. listen to PubSub messages), do intermediate checks and trigger the particular Dataform workflow execution configuration.
50+
In order to unify the workflow triggering mechanism, we use [a Cloud Run function](./infra/README.md) that can be invoked in a number of ways (e.g. by listening to Pub/Sub messages), performs intermediate checks, and triggers the particular Dataform workflow execution configuration.
5151

5252
## Contributing
5353

@@ -59,5 +59,38 @@ In order to unify the workflow triggering mechanism, we use [a Cloud Run functio
5959

6060
#### Workspace hints
6161

62-
1. In `workflow_settings.yaml` set `env_name: dev` to process sampled data.
63-
2. In `includes/constants.js` set `today` or other variables to a custome value.
62+
1. In `workflow_settings.yaml` set `environment: dev` to process sampled data.
63+
2. For development and testing, you can modify variables in `includes/constants.js`, but note that these are programmatically generated.
64+
65+
## Repository Structure
66+
67+
- `definitions/` - Contains the core Dataform SQL definitions and declarations
68+
- `output/` - Contains the main pipeline transformation logic
69+
- `declarations/` - Contains declarations of referenced tables/views and definitions of other resources
70+
- `includes/` - Contains shared JavaScript utilities and constants
71+
- `infra/` - Infrastructure code and deployment configurations
72+
- `dataform-trigger/` - Cloud Run function for workflow automation
73+
- `tf/` - Terraform configurations
74+
- `bigquery-export/` - BigQuery export configurations
75+
- `docs/` - Additional documentation
76+
77+
## Development Setup
78+
79+
1. Install dependencies:
80+
81+
```bash
82+
npm install
83+
```
84+
85+
2. Available Scripts:
86+
87+
- `npm run format` - Format code using Standard.js, fix Markdown issues, and format Terraform files
88+
- `npm run lint` - Run linting checks on JavaScript, Markdown files, and compile Dataform configs
89+
90+
## Code Quality
91+
92+
This repository uses:
93+
94+
- Standard.js for JavaScript code style
95+
- Markdownlint for Markdown file formatting
96+
- Dataform's built-in compiler for SQL validation

definitions/declarations/chrome-ux-report.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ assert('country_summary_not_empty').query(ctx => `
1313
FROM ${ctx.ref(database, 'materialized', 'country_summary')}
1414
|> WHERE yyyymm = ${pastMonthYYYYMM}
1515
|> AGGREGATE COUNT(DISTINCT country_code) AS cnt_countries
16-
|> WHERE cnt_countries != 238
17-
|> SELECT "Table data doesn't match 238 countries" AS error_message
16+
|> WHERE cnt_countries < 236
17+
|> SELECT "Table data has less than 236 countries" AS error_message
1818
`)
1919

2020
declare({
@@ -25,7 +25,7 @@ declare({
2525

2626
assert('device_summary_not_empty').query(ctx => `
2727
FROM ${ctx.ref(database, 'materialized', 'device_summary')}
28-
|> WHERE date = ''${pastMonth}''
28+
|> WHERE date = '${pastMonth}'
2929
|> AGGREGATE COUNT(DISTINCT device) AS cnt_devices, COUNT(DISTINCT rank) AS cnt_ranks
3030
|> WHERE cnt_devices != 3 OR cnt_ranks != 10
3131
|> SELECT "Table data doesn't match 3 unique devices and 10 ranks" AS error_message

definitions/declarations/httparchive.js

Lines changed: 1 addition & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -4,34 +4,7 @@
44
schema: 'crawl_staging',
55
name: table
66
})
7-
)
8-
9-
// See https://github.com/HTTPArchive/dataform/issues/43
10-
assert('corrupted_technology_values')
11-
.tags(['crawl_complete'])
12-
.query(ctx => `
13-
SELECT
14-
date,
15-
client,
16-
tech,
17-
COUNT(DISTINCT page) AS cnt_pages,
18-
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
19-
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
20-
LEFT JOIN pages.technologies AS tech
21-
LEFT JOIN tech.categories AS category
22-
WHERE
23-
date = '${constants.currentMonth}' AND
24-
(
25-
tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies)
26-
OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
27-
OR ARRAY_LENGTH(tech.categories) = 0
28-
)
29-
GROUP BY
30-
date,
31-
client,
32-
tech
33-
ORDER BY cnt_pages DESC
34-
`);
7+
);
358

369
// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
3710
['technologies', 'categories'].forEach(table =>
@@ -40,21 +13,3 @@ ORDER BY cnt_pages DESC
4013
name: table
4114
})
4215
)
43-
44-
operate('create_reservation_assignment')
45-
.tags(['crawl_complete'])
46-
.queries(ctx => `
47-
CREATE ASSIGNMENT
48-
\`httparchive.region-us.retrospective-reprocessing.pipeline\`
49-
OPTIONS (
50-
assignee = 'projects/httparchive',
51-
job_type = 'QUERY')
52-
`)
53-
54-
operate('drop_reservation_assignment')
55-
.dependencies(['requests_10k'])
56-
.tags(['crawl_complete'])
57-
.queries(ctx => `
58-
DROP ASSIGNMENT IF EXISTS
59-
\`httparchive.region-us.retrospective-reprocessing.pipeline\`
60-
`)

definitions/output/blink_features/features.js

Lines changed: 0 additions & 37 deletions
This file was deleted.

definitions/output/blink_features/usage.js

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,36 @@ publish('usage', {
22
schema: 'blink_features',
33
type: 'incremental',
44
protected: true,
5+
bigquery: {
6+
partitionBy: 'date',
7+
clusterBy: ['client', 'rank', 'feature']
8+
},
9+
description: 'Used in https://lookerstudio.google.com/u/0/reporting/1M8kXOqPkwYNKjJhtag_nvDNJCpvmw_ri/page/tc5b, embedded in https://chromestatus.com/metrics/feature/timeline/popularity/2203',
510
tags: ['crawl_complete', 'blink_report']
611
}).preOps(ctx => `
712
DELETE FROM ${ctx.self()}
8-
WHERE yyyymmdd = REPLACE('${constants.currentMonth}', '-', '');
13+
WHERE date = '${constants.currentMonth}';
914
`).query(ctx => `
15+
WITH pages AS (
1016
SELECT
11-
REPLACE(CAST(date AS STRING), '-', '') AS yyyymmdd,
17+
date,
1218
client,
19+
rank,
20+
page,
21+
features
22+
FROM ${ctx.ref('crawl', 'pages')}
23+
WHERE
24+
date = '${constants.currentMonth}' AND
25+
is_root_page = TRUE
26+
${constants.devRankFilter}
27+
), ranks AS (
28+
SELECT DISTINCT rank FROM pages
29+
)
30+
31+
SELECT
32+
date,
33+
client,
34+
rank,
1335
id,
1436
feature,
1537
type,
@@ -19,20 +41,22 @@ SELECT
1941
sample_urls
2042
FROM (
2143
SELECT
22-
yyyymmdd AS date,
44+
date,
2345
client,
24-
id,
25-
feature,
26-
type,
27-
COUNT(DISTINCT url) AS num_urls,
28-
ARRAY_AGG(url ORDER BY rank, url LIMIT 100) AS sample_urls
29-
FROM ${ctx.ref('blink_features', 'features')}
30-
WHERE
31-
yyyymmdd = '${constants.currentMonth}'
32-
${constants.devRankFilter}
46+
ranks.rank,
47+
feature.id,
48+
feature.feature,
49+
feature.type,
50+
COUNT(DISTINCT page) AS num_urls,
51+
ARRAY_AGG(page ORDER BY pages.rank, page LIMIT 100) AS sample_urls
52+
FROM pages
53+
CROSS JOIN UNNEST(features) AS feature
54+
FULL OUTER JOIN ranks
55+
ON pages.rank <= ranks.rank
3356
GROUP BY
34-
yyyymmdd,
57+
date,
3558
client,
59+
ranks.rank,
3660
id,
3761
feature,
3862
type
@@ -41,15 +65,15 @@ JOIN (
4165
SELECT
4266
date,
4367
client,
68+
ranks.rank,
4469
COUNT(DISTINCT page) AS total_urls
45-
FROM ${ctx.ref('crawl', 'pages')}
46-
WHERE
47-
date = '${constants.currentMonth}' AND
48-
is_root_page = TRUE
49-
${constants.devRankFilter}
70+
FROM pages
71+
FULL OUTER JOIN ranks
72+
ON pages.rank <= ranks.rank
5073
GROUP BY
5174
date,
52-
client
75+
client,
76+
ranks.rank
5377
)
54-
USING (date, client)
78+
USING (date, client, rank)
5579
`)

definitions/output/core_web_vitals/technologies.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ publish('technologies', {
99
clusterBy: ['geo', 'app', 'rank', 'client'],
1010
requirePartitionFilter: true
1111
},
12-
tags: ['crux_ready', 'tech_report'],
12+
tags: ['crux_ready'],
1313
dependOnDependencyAssertions: true
1414
}).preOps(ctx => `
1515
DELETE FROM ${ctx.self()}

0 commit comments

Comments
 (0)