@@ -15,30 +15,32 @@
 ########################
 ### GLOBAL VARIABLES ###
 ########################
-globalgameslist = utils.gimme_global_games_list() # slug in entries folder
-logger = Logger(utils.PREFERRED_OUTPUT) # logger will print in file or on console depending on params in utils.PREFERRED_OUTPUT --> LOG or CONSOLE
+globalgameslist = utils.gimme_global_games_list()  # slug in entries folder
+logger = Logger(
+    utils.PREFERRED_OUTPUT
+)  # logger will print in file or on console depending on params in utils.PREFERRED_OUTPUT --> LOG or CONSOLE
 
 baseurl = "https://demozoo.org"
 blacklist = [
-    #"missing-colors", # file in a folder...must solve this ASAP
-    "pdroms-com-relaunch" # duplicate file (and it doesn't have devs specified)
+    # "missing-colors",  # file in a folder...must solve this ASAP
+    "pdroms-com-relaunch"  # duplicate file (and it doesn't have devs specified)
 ]
 
 #############
 ### DEBUG ###
 #############
-added = [] # debug
-#as a friendly reminder, remember to change utils.DEBUG flag!
+added = []  # debug
+# as a friendly reminder, remember to change utils.DEBUG flag!
 
 #################
 ### CONSTANTS ###
 #################
 
-#TODO: GBA placeholder intentionally left here for future development.
+# TODO: GBA placeholder intentionally left here for future development.
 ##
-# dict containing demozoo's categories,
-# with a mapped "simplified" category according to CONTRIBUTING.MD
-# "game", "homebrew", "demo" or "hackrom"
+# dict containing demozoo's categories,
+# with a mapped "simplified" category according to CONTRIBUTING.MD
+# "game", "homebrew", "demo" or "hackrom"
 ##
 PLATFORMS = {
     "Gameboy": [38, "GB"],
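For orientation: each PLATFORMS value pairs Demozoo's numeric platform id (used in the productions query string) with the extension tag later handed to scrape_page(). A minimal sketch of how one entry is consumed, using the "Gameboy" row visible above:

    platform_id, extension = PLATFORMS["Gameboy"]  # 38, "GB"
    listing_url = baseurl + "/productions/?platform=" + str(platform_id) + "&page=1"
    # -> "https://demozoo.org/productions/?platform=38&page=1"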
@@ -50,39 +52,50 @@
 # Default: "../../entries"
 entrypath = "py_common/" + utils.BETA_FOLDER + "/" if utils.DEBUG else "../../entries"
 
+
 #################
 ### FUNCTIONS ###
 #################
 def scrape(platform):
-    '''
-    scrape Demozoo prods page and fetches all links
-    - each link will be processed (scraped) and a Production object will be built
-    - this object will be used to build JSON, files and folders
-    '''
+    """
+    scrape Demozoo prods page and fetches all links
+    - each link will be processed (scraped) and a Production object will be built
+    - this object will be used to build JSON, files and folders
+    """
     logger.write("[INFO]", "Scraping platform " + platform)
-    page = requests.get(baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=1", timeout=None)
-    soup = BeautifulSoup(page.content, 'html.parser')
+    page = requests.get(
+        baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=1",
+        timeout=None,
+    )
+    soup = BeautifulSoup(page.content, "html.parser")
 
     # parsing every page
     enough_page = True
     i = 0
     while enough_page:
-        if soup.find('a', {"title": "Next_page"}):
+        if soup.find("a", {"title": "Next_page"}):
             enough_page = True
         else:
             enough_page = False
 
-        logger.write("[INFO]", "Parsing page: " + str(i + 1) )
-        #TODO: dont call twice this page, as it is called before
-
-        page = requests.get(baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=" + str(i + 1), timeout=None)
-        soup = BeautifulSoup(page.content, 'html.parser')
+        logger.write("[INFO]", "Parsing page: " + str(i + 1))
+        # TODO: dont call twice this page, as it is called before
+
+        page = requests.get(
+            baseurl
+            + "/productions/?platform="
+            + str(PLATFORMS[platform][0])
+            + "&page="
+            + str(i + 1),
+            timeout=None,
+        )
+        soup = BeautifulSoup(page.content, "html.parser")
 
         # get the big prods table
-        prodTable = soup.findAll('tbody')[0].findAll('a')
+        prodTable = soup.findAll("tbody")[0].findAll("a")
 
         # get links "worth to parse" (those ones that links to a production page)
-        links = [ link for link in prodTable if "productions" in link.get("href") ]
+        links = [link for link in prodTable if "productions" in link.get("href")]
 
         # get rows; for each rows, get the name of the prod and the internal link
         for link in links:
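The loop above keeps fetching ?page=N until the listing no longer contains a "next page" anchor. A standalone sketch of that stop condition (the markup here is illustrative, not Demozoo's actual HTML):

    from bs4 import BeautifulSoup

    html = '<a title="Next_page" href="?page=2">next</a>'
    soup = BeautifulSoup(html, "html.parser")
    has_next = soup.find("a", {"title": "Next_page"}) is not None  # True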
@@ -94,19 +107,21 @@ def scrape(platform):
             if slug not in globalgameslist and slug not in blacklist:
                 # scrape demozoo's page: the returned object will be used to build the file hierarchy
                 prod = scrape_page(slug, demozoo_internal_link, PLATFORMS[platform][1])
-
+
                 if prod != -1:
-                    #DBGPRINT slugprint
-                    #print(prod.slug)
+                    # DBGPRINT slugprint
+                    # print(prod.slug)
 
                     # check if it could be added to database or not
                     # building files
-                    ret = utils.build(prod, entrypath, ["gb", "gbc"]) # TODO: GBA, add GBA to this list
-
+                    ret = utils.build(
+                        prod, entrypath, ["gb", "gbc"]
+                    )  # TODO: GBA, add GBA to this list
+
                     # make required JSON file
                     if ret != 1:
                         ret = utils.makeJSON(prod, entrypath)
-
+
                     # useful to print all added entries (to spot duplicates for example)
                     if utils.DEBUG:
                         added.append(prod.slug)
@@ -116,59 +131,89 @@ def scrape(platform):
         elif slug in globalgameslist:
             logger.write("[WARN]", " " + slug + " already in entries folder!")
 
+
 def parse_date(date_string):
-    date_part = re.search(r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string)
-
+    date_string = date_string.replace("Released ", "")
+
+    date_part = re.search(
+        r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string
+    )
+
     if not date_part:
         raise ValueError(f"No recognizable date found in: {date_string}")
-
+
     date_part = date_part.group(0)  # Extract the matched part
-
-    parsed_date = datetime.strptime(date_part, "%d %B %Y")
 
-    # Convert to desired format
-    return parsed_date.strftime("%Y-%m-%d")
+    # Determine the format based on the matched part
+    try:
+        if re.match(
+            r"\d{1,2} [A-Za-z]+ \d{4}", date_part
+        ):  # Full date like "1 January 2024"
+            parsed_date = datetime.strptime(date_part, "%d %B %Y")
+            return parsed_date.strftime("%Y-%m-%d")
+        elif re.match(r"[A-Za-z]+ \d{4}", date_part):  # Month and year like "June 2009"
+            parsed_date = datetime.strptime(date_part, "%B %Y")
+            return parsed_date.strftime("%Y-%m")
+        elif re.match(r"\d{4}", date_part):  # Year only like "2009"
+            parsed_date = datetime.strptime(date_part, "%Y")
+            return parsed_date.strftime("%Y")
+    except ValueError as e:
+        raise ValueError(f"Error parsing date: {e}")
+
 
 def scrape_page(slug, url, platform):
     demozoo_url = url
-    '''
+    """
     given a slug and demozoo production url, it returns an object containing everything useful
     to build a file hierarchy
-    '''
+    """
     # init variables
     screenshots = []
     files = []
     typetag = ""
 
     page = requests.get(url, timeout=None)
-    soup = BeautifulSoup(page.content, 'html.parser')
+    soup = BeautifulSoup(page.content, "html.parser")
 
     # getting title
-    title = str.strip(soup.find('div', {"class": "production_title focus_title"}).findChildren("h2")[0].text)
+    title = str.strip(
+        soup.find("div", {"class": "production_title focus_title"})
+        .findChildren("h2")[0]
+        .text
+    )
 
-    date_string = str.strip(soup.find('ul', {"class": "attributes"}).findChildren("li")[0].text)
+    date_string = str.strip(
+        soup.find("ul", {"class": "attributes"}).findChildren("li")[0].text
+    )
 
     release_date = None
 
     try:
         release_date = parse_date(date_string)
         print(date_string, "->", parse_date(date_string))
     except:
-        print("nodate")
-
+        print("COULDN'T PARSE DATE:", date_string)
 
     logger.write("[INFO]", " Adding: " + title + " ...")
 
     # getting developer
-    developer = str.strip(soup.find('div', {"class": "production_title focus_title"}).findChildren("h3")[0].findChildren("a")[0].text)
-
+    developer = str.strip(
+        soup.find("div", {"class": "production_title focus_title"})
+        .findChildren("h3")[0]
+        .findChildren("a")[0]
+        .text
+    )
+
     # fetching tag
-    list_typetag = soup.find('li', {"class": "signpost"})
+    list_typetag = soup.find("li", {"class": "signpost"})
    if list_typetag == None:
         typetag = ""
     else:
-        typetag = str.strip(list_typetag.text if not isinstance(list_typetag, list) else list_typetag[0].text)
-
+        typetag = str.strip(
+            list_typetag.text
+            if not isinstance(list_typetag, list)
+            else list_typetag[0].text
+        )
 
     if "TRO" in typetag.upper() or "DEMO" in typetag.upper():
         typetag = "demo"
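Taken together, the rewritten parse_date() now accepts three date shapes instead of one; its expected behaviour looks like this (sample strings are illustrative):

    parse_date("Released 1 January 2024")  # -> "2024-01-01"
    parse_date("Released June 2009")       # -> "2009-06"
    parse_date("Released 2009")            # -> "2009"
    parse_date("Released someday")         # raises ValueError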
@@ -181,9 +226,9 @@ def scrape_page(slug, url, platform):
     else:
         logger.write("[WARN]", " We don't care about this category: " + typetag)
         return -1
-
+
     # fetching screenshot
-    screen_obj = soup.find('a', {"class": "screenshot"})
+    screen_obj = soup.find("a", {"class": "screenshot"})
     if screen_obj is not None:
         screenshot = screen_obj.get("href")
     else:
@@ -196,7 +241,7 @@ def scrape_page(slug, url, platform):
     source = source.get("href") if source else ""
 
     # fetching url (if present)
-    url = soup.find('ul', {"class": "download_links"})
+    url = soup.find("ul", {"class": "download_links"})
     if url is not None:
         url = url.findChildren("a")
     else:
@@ -210,7 +255,10 @@ def scrape_page(slug, url, platform):
     elif len(url) == 1:
         url = url[0].get("href")
         if "modermodemet.se" in url:
-            logger.write("[ERR]", " modermodemet.se is not available, and no other valid link has been found")
+            logger.write(
+                "[ERR]",
+                " modermodemet.se is not available, and no other valid link has been found",
+            )
             return -1
     elif len(url) >= 2:
         # because almost always the prod will have the secondary mirror as scene.org or smth like that
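Per the comment above, the len(url) >= 2 branch (its body falls outside the visible hunks) prefers the secondary mirror over the often-dead first link. A condensed sketch of that preference, inferred from the comment (hrefs are illustrative):

    hrefs = ["http://modermodemet.se/prod.gb", "https://files.scene.org/prod.gb"]
    chosen = hrefs[1] if len(hrefs) >= 2 else hrefs[0]  # prefer the second mirror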
@@ -221,19 +269,33 @@ def scrape_page(slug, url, platform):
     # fetching video
     video = soup.find(lambda tag: tag.name == "a" and "youtube" in tag.text.lower())
     video = video.get("href") if video else ""
-
+
     files = [f"{slug}.{platform.lower()}"]
 
-    return Production(title, slug, developer, platform, typetag, screenshots, files, video, date=release_date, repository=source, url=demozoo_url)
+    return Production(
+        title,
+        slug,
+        developer,
+        platform,
+        typetag,
+        screenshots,
+        files,
+        video,
+        date=release_date,
+        repository=source,
+        url=demozoo_url,
+    )
+
 
 def main():
     for platform in PLATFORMS.keys():
-        logger.write("[INFO]","Parsing platform: " + platform)
+        logger.write("[INFO]", "Parsing platform: " + platform)
         scrape(platform)
-
+
+
 main()
 
 if utils.DEBUG:
-    [ logger.write("[TITLE]", f) for f in added ]
+    [logger.write("[TITLE]", f) for f in added]
 
-logger.write("[INFO]", "demozoo importer ended!")
+logger.write("[INFO]", "demozoo importer ended!")
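One behaviour this diff leaves untouched: main() still runs at module import time, so the file cannot be imported (e.g. to unit-test parse_date()) without kicking off a full scrape. A possible follow-up, sketched under the usual guard:

    if __name__ == "__main__":
        main()

        if utils.DEBUG:
            [logger.write("[TITLE]", f) for f in added]

        logger.write("[INFO]", "demozoo importer ended!")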