@@ -52,23 +52,97 @@ publish('pages', {
5252DELETE FROM ${ ctx . self ( ) }
5353WHERE date = '${ constants . currentMonth } ' AND
5454 client = 'desktop';
55- ` ) . query ( ctx => `
55+
56+ INSERT INTO ${ ctx . self ( ) }
5657SELECT
5758 *
5859FROM ${ ctx . ref ( 'crawl_staging' , 'pages' ) }
5960WHERE date = '${ constants . currentMonth } ' AND
6061 client = 'desktop'
61- ${ constants . devRankFilter }
62- ` ) . postOps ( ctx => `
62+ ${ constants . devRankFilter } ;
63+
6364DELETE FROM ${ ctx . self ( ) }
6465WHERE date = '${ constants . currentMonth } ' AND
6566 client = 'mobile';
66-
67- INSERT INTO ${ ctx . self ( ) }
67+ ` ) . query ( ctx => `
6868SELECT
6969 *
7070FROM ${ ctx . ref ( 'crawl_staging' , 'pages' ) }
7171WHERE date = '${ constants . currentMonth } ' AND
7272 client = 'mobile'
7373 ${ constants . devRankFilter }
74+ ` ) . postOps ( ctx => `
-- Builds a temp table holding, for every impacted page of the current month,
-- a rebuilt technologies array: detections whose technology is unknown to the
-- wappalyzer reference are dropped, and the categories of the remaining ones
-- are replaced with the reference categories.
-- Output columns: client, page, technologies.
CREATE TEMP TABLE technologies_cleaned AS (
  -- Reference list of technologies and their category arrays
  WITH wappalyzer AS (
    SELECT DISTINCT
      name AS technology,
      categories
    FROM ${ctx.ref('wappalyzer', 'technologies')}
  ),

  -- One row per detected technology of each page in the current month.
  -- The LEFT JOIN keeps pages without detections as a single row whose
  -- technology columns are NULL.
  pages AS (
    SELECT
      client,
      page,
      tech.technology,
      tech.categories,
      tech.info
    FROM ${ctx.self()} AS pages
    LEFT JOIN pages.technologies AS tech
    WHERE date = '${constants.currentMonth}' ${constants.devRankFilter}
  ),

  -- Identify impacted pages
  impacted_pages AS (
    SELECT DISTINCT
      client,
      page
    FROM pages
    LEFT JOIN pages.categories AS category
    WHERE
      -- Technology is corrupted: it has no entry in the reference list.
      -- NOT EXISTS is used instead of NOT IN so that a NULL on the reference
      -- side cannot silently turn the predicate into UNKNOWN for every row.
      (
        pages.technology IS NOT NULL AND
        NOT EXISTS (
          SELECT 1
          FROM wappalyzer AS known
          WHERE known.technology = pages.technology
        )
      ) OR
      -- Technology's category is corrupted: the (technology, category) pair
      -- is compared column by column rather than through a concatenated key,
      -- which was ambiguous and became NULL for category-less technologies.
      -- Rows without a category are not flagged here, as before.
      (
        pages.technology IS NOT NULL AND
        category IS NOT NULL AND
        NOT EXISTS (
          SELECT 1
          FROM wappalyzer AS known
          CROSS JOIN UNNEST(known.categories) AS known_category
          WHERE
            known.technology = pages.technology AND
            known_category = category
        )
      )
  ),

  -- Keep valid technologies and use correct categories
  reconstructed_technologies AS (
    SELECT
      client,
      page,
      -- LEFT JOIN + IGNORE NULLS keeps a row for impacted pages whose
      -- technologies are all invalid; they get an empty array so the
      -- corrupted entries are actually removed by the later UPDATE.
      IFNULL(
        ARRAY_AGG(
          IF(
            wappalyzer.technology IS NOT NULL,
            STRUCT(
              pages.technology,
              wappalyzer.categories,
              pages.info
            ),
            NULL
          ) IGNORE NULLS
        ),
        []
      ) AS technologies
    FROM pages
    INNER JOIN impacted_pages
      USING (client, page)
    LEFT JOIN wappalyzer
      ON pages.technology = wappalyzer.technology
    GROUP BY
      client,
      page
  )

  SELECT
    client,
    page,
    technologies
  FROM reconstructed_technologies
);
140+
-- Write the rebuilt technologies arrays back into this table: every row dated
-- constants.currentMonth whose (client, page) appears in technologies_cleaned
-- has its technologies column replaced by the cleaned value.
UPDATE ${ctx.self()} AS target
SET technologies = cleaned.technologies
FROM technologies_cleaned AS cleaned
WHERE
  target.client = cleaned.client AND
  target.page = cleaned.page AND
  target.date = '${constants.currentMonth}';
74148` )
0 commit comments