Skip to content

Commit 04e7e86

Browse files
Crawl columns descriptions across all table variants (#213)
* Refactor output definitions to use centralized column descriptions and create new latest and sample_data files
* Add descriptions for columns in the WebPageTest data structure
* lint
1 parent 625eb81 commit 04e7e86

11 files changed

Lines changed: 194 additions & 80 deletions

File tree

.github/linters/eslint.config.mjs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ export default [
2222
ctx: 'readonly',
2323
constants: 'readonly',
2424
reports: 'readonly',
25-
reservations: 'readonly'
25+
reservations: 'readonly',
26+
descriptions: 'readonly'
2627
}
2728
},
2829
rules: {

definitions/output/crawl/pages.js

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
const columns = descriptions.columns.pages
2+
13
// See https://github.com/HTTPArchive/dataform/issues/43
24
assert('corrupted_technology_values')
35
.tags(['crawl_complete'])
@@ -53,46 +55,7 @@ publish('pages', {
5355
clusterBy: ['client', 'is_root_page', 'rank', 'page'],
5456
requirePartitionFilter: true
5557
},
56-
columns: {
57-
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
58-
client: 'Test environment: desktop or mobile',
59-
page: 'The URL of the page being tested',
60-
is_root_page: 'Whether the page is the root of the origin',
61-
root_page: 'The URL of the root page being tested, the origin followed by /',
62-
rank: 'Site popularity rank, from CrUX',
63-
wptid: 'ID of the WebPageTest results',
64-
payload: 'JSON-encoded WebPageTest results for the page',
65-
summary: 'JSON-encoded summarization of the page-level data',
66-
custom_metrics: {
67-
description: 'Custom metrics from WebPageTest',
68-
columns: {
69-
a11y: 'JSON-encoded A11Y metrics',
70-
cms: 'JSON-encoded CMS detection',
71-
cookies: 'JSON-encoded cookie metrics',
72-
css_variables: 'JSON-encoded CSS variable metrics',
73-
ecommerce: 'JSON-encoded ecommerce metrics',
74-
element_count: 'JSON-encoded element count metrics',
75-
javascript: 'JSON-encoded JavaScript metrics',
76-
markup: 'JSON-encoded markup metrics',
77-
media: 'JSON-encoded media metrics',
78-
origin_trials: 'JSON-encoded origin trial metrics',
79-
performance: 'JSON-encoded performance metrics',
80-
privacy: 'JSON-encoded privacy metrics',
81-
responsive_images: 'JSON-encoded responsive image metrics',
82-
robots_txt: 'JSON-encoded robots.txt metrics',
83-
security: 'JSON-encoded security metrics',
84-
structured_data: 'JSON-encoded structured data metrics',
85-
third_parties: 'JSON-encoded third-party metrics',
86-
well_known: 'JSON-encoded well-known metrics',
87-
wpt_bodies: 'JSON-encoded WebPageTest bodies',
88-
other: 'JSON-encoded other custom metrics'
89-
}
90-
},
91-
lighthouse: 'JSON-encoded Lighthouse report',
92-
features: 'Blink features detected at runtime (see https://chromestatus.com/features)',
93-
technologies: 'Technologies detected at runtime (see https://www.wappalyzer.com/)',
94-
metadata: 'Additional metadata about the test'
95-
},
58+
columns: columns,
9659
tags: ['crawl_complete'],
9760
dependOnDependencyAssertions: true
9861
}).preOps(ctx => `

definitions/output/crawl/parsed_css.js

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
const columns = descriptions.columns.parsed_css
2+
13
publish('parsed_css', {
24
type: 'incremental',
35
protected: true,
@@ -7,16 +9,7 @@ publish('parsed_css', {
79
clusterBy: ['client', 'is_root_page', 'rank', 'page'],
810
requirePartitionFilter: true
911
},
10-
columns: {
11-
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
12-
client: 'Test environment: desktop or mobile',
13-
page: 'The URL of the page being tested',
14-
is_root_page: 'Whether the page is the root of the origin.',
15-
root_page: 'The URL of the root page being tested',
16-
rank: 'Site popularity rank, from CrUX',
17-
url: 'The URL of the request',
18-
css: 'The parsed CSS, in JSON format'
19-
},
12+
columns: columns,
2013
tags: ['crawl_complete']
2114
}).preOps(ctx => `
2215
${reservations.reservation_setter(ctx)}

definitions/output/crawl/requests.js

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
const columns = descriptions.columns.requests
2+
13
publish('requests', {
24
type: 'incremental',
35
protected: true,
@@ -7,35 +9,7 @@ publish('requests', {
79
clusterBy: ['client', 'is_root_page', 'type', 'rank'],
810
requirePartitionFilter: true
911
},
10-
columns: {
11-
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
12-
client: 'Test environment: desktop or mobile',
13-
page: 'The URL of the page being tested',
14-
is_root_page: 'Whether the page is the root of the origin.',
15-
root_page: 'The URL of the root page being tested',
16-
rank: 'Site popularity rank, from CrUX',
17-
url: 'The URL of the request',
18-
is_main_document: 'Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects',
19-
type: 'Simplified description of the type of resource (script, html, css, text, other, etc)',
20-
index: 'The sequential 0-based index of the request',
21-
payload: 'JSON-encoded WebPageTest result data for this request',
22-
summary: 'JSON-encoded summarization of request data',
23-
request_headers: {
24-
description: 'Request headers',
25-
columns: {
26-
name: 'Request header name',
27-
value: 'Request header value'
28-
}
29-
},
30-
response_headers: {
31-
description: 'Response headers',
32-
columns: {
33-
name: 'Response header name',
34-
value: 'Response header value'
35-
}
36-
},
37-
response_body: 'Text-based response body'
38-
},
12+
columns: columns,
3913
tags: ['crawl_complete']
4014
}).preOps(ctx => `
4115
${reservations.reservation_setter(ctx)}

definitions/output/latest/pages.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
const columns = descriptions.columns.pages
2+
3+
publish('pages', {
4+
type: 'view',
5+
schema: 'latest',
6+
columns: columns
7+
}).query(ctx => `
8+
SELECT
9+
*
10+
FROM ${ctx.ref('crawl', 'pages')}
11+
WHERE
12+
date = (
13+
SELECT
14+
PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
15+
FROM
16+
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
17+
WHERE
18+
table_name = 'pages' AND
19+
/* Only include actual dates in partition ids */
20+
partition_id >= '20250101' AND
21+
partition_id < '20990101' AND
22+
/* Exclude future dates - shouldn't be any, but you never know! */
23+
partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
24+
) AND
25+
/* The following should help make this even faster since above query is a little complex */
26+
/* We should never be more than 60 days old hopefully! */
27+
date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
28+
date <= CURRENT_DATE()
29+
`)
definitions/output/latest/parsed_css.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
const columns = descriptions.columns.parsed_css
2+
3+
publish('parsed_css', {
4+
type: 'view',
5+
schema: 'latest',
6+
columns: columns,
7+
}).query(ctx => `
8+
SELECT
9+
*
10+
FROM ${ctx.ref('crawl', 'parsed_css')}
11+
WHERE
12+
date = (
13+
SELECT
14+
PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
15+
FROM
16+
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
17+
WHERE
18+
table_name = 'parsed_css' AND
19+
/* Only include actual dates in partition ids */
20+
partition_id >= '20250101' AND
21+
partition_id < '20990101' AND
22+
/* Exclude future dates - shouldn't be any, but you never know! */
23+
partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
24+
) AND
25+
/* The following should help make this even faster since above query is a little complex */
26+
/* We should never be more than 60 days old hopefully! */
27+
date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
28+
date <= CURRENT_DATE()
29+
`)
definitions/output/latest/requests.js

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
const columns = descriptions.columns.requests
2+
3+
publish('requests', {
4+
type: 'view',
5+
schema: 'latest',
6+
columns: columns,
7+
}).query(ctx => `
8+
SELECT
9+
*
10+
FROM
11+
${ctx.ref('crawl', 'requests')}
12+
WHERE
13+
date = (
14+
SELECT
15+
PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
16+
FROM
17+
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
18+
WHERE
19+
table_name = 'requests' AND
20+
/* Only include actual dates in partition ids */
21+
partition_id >= '20250101' AND
22+
partition_id < '20990101' AND
23+
/* Exclude future dates - shouldn't be any, but you never know! */
24+
partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
25+
) AND
26+
/* The following should help make this even faster since above query is a little complex */
27+
/* We should never be more than 60 days old hopefully! */
28+
date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
29+
date <= CURRENT_DATE()
30+
`)

definitions/output/sample_data/pages_10k.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
const columns = descriptions.columns.pages
2+
13
publish('pages_10k', {
24
type: 'table',
35
schema: 'sample_data',
46
bigquery: {
57
partitionBy: 'date',
68
clusterBy: ['client', 'is_root_page', 'rank', 'page']
79
},
10+
columns: columns,
811
tags: ['crawl_complete']
912
}).query(ctx => `
1013
SELECT *

definitions/output/sample_data/parsed_css_10k.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
const columns = descriptions.columns.parsed_css
2+
13
publish('parsed_css_10k', {
24
type: 'table',
35
schema: 'sample_data',
46
bigquery: {
57
partitionBy: 'date',
68
clusterBy: ['client', 'is_root_page', 'rank', 'page']
79
},
10+
columns: columns,
811
tags: ['crawl_complete']
912
}).query(ctx => `
1013
SELECT *

definitions/output/sample_data/requests_10k.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
const columns = descriptions.columns.requests
2+
13
publish('requests_10k', {
24
type: 'table',
35
schema: 'sample_data',
46
bigquery: {
57
partitionBy: 'date',
68
clusterBy: ['client', 'is_root_page', 'rank', 'type']
79
},
10+
columns: columns,
811
tags: ['crawl_complete']
912
}).query(ctx => `
1013
SELECT *

0 commit comments

Comments (0)