@@ -73,7 +73,7 @@ WHERE date = '${constants.currentMonth}' AND
7373 ${ constants . devRankFilter }
7474` ) . postOps ( ctx => `
7575CREATE TEMP TABLE technologies_cleaned AS (
76- WITH wappalyzer AS (
76+ WITH technologies AS (
7777 SELECT DISTINCT
7878 name AS technology,
7979 categories
@@ -101,13 +101,13 @@ CREATE TEMP TABLE technologies_cleaned AS (
101101 LEFT JOIN pages.categories AS category
102102 WHERE
103103 -- Technology is corrupted
104- technology NOT IN (SELECT DISTINCT technology FROM wappalyzer ) OR
104+ technology NOT IN (SELECT DISTINCT technology FROM technologies ) OR
105105 -- Technology's category is corrupted
106106 CONCAT(technology, category) NOT IN (
107107 SELECT DISTINCT
108108 CONCAT(technology, category)
109- FROM wappalyzer
110- LEFT JOIN wappalyzer .categories AS category
109+ FROM technologies
110+ INNER JOIN technologies .categories AS category
111111 )
112112 ),
113113
@@ -118,14 +118,14 @@ CREATE TEMP TABLE technologies_cleaned AS (
118118 page,
119119 ARRAY_AGG(STRUCT(
120120 pages.technology,
121- wappalyzer .categories,
121+ technologies .categories,
122122 pages.info
123123 )) AS technologies
124124 FROM pages
125125 INNER JOIN impacted_pages
126126 USING (client, page)
127- INNER JOIN wappalyzer
128- ON pages. technology = wappalyzer.technology
127+ INNER JOIN technologies
128+ USING ( technology)
129129 GROUP BY
130130 client,
131131 page
@@ -134,8 +134,10 @@ CREATE TEMP TABLE technologies_cleaned AS (
134134 SELECT
135135 client,
136136 page,
137- technologies
138- FROM reconstructed_technologies
137+ reconstructed_technologies.technologies
138+ FROM impacted_pages
139+ LEFT JOIN reconstructed_technologies
140+ USING(client,page)
139141);
140142
141143-- Update the crawl.pages table with the cleaned and restored technologies
0 commit comments