2626LOGGER , PATHS = shared .setup (__file__ )
2727
2828# Constants
29- FILE1_COUNT = shared .path_join (PATHS ["data_1-fetch" ], "gcs_1_count.csv" )
30- FILE2_LANGUAGE = shared .path_join (
31- PATHS ["data_1-fetch" ], "gcs_2_count_by_language.csv"
32- )
33- FILE3_COUNTRY = shared .path_join (
34- PATHS ["data_1-fetch" ], "gcs_3_count_by_country.csv"
35- )
3629QUARTER = os .path .basename (PATHS ["data_quarter" ])
3730
3831
@@ -42,19 +35,28 @@ def parse_arguments():
4235 """
4336 LOGGER .info ("Parsing command-line options" )
4437 parser = argparse .ArgumentParser (description = __doc__ )
38+ parser .add_argument (
39+ "--quarter" ,
40+ default = QUARTER ,
41+ help = f"Data quarter in format YYYYQx (default: { QUARTER } )" ,
42+ )
4543 parser .add_argument (
4644 "--enable-save" ,
4745 action = "store_true" ,
48- help = "Enable saving results" ,
46+ help = "Enable saving results (default: False) " ,
4947 )
5048 parser .add_argument (
5149 "--enable-git" ,
5250 action = "store_true" ,
53- help = "Enable git actions (fetch, merge, add, commit, and push)" ,
51+ help = "Enable git actions such as fetch, merge, add, commit, and push"
52+ " (default: False)" ,
5453 )
5554 args = parser .parse_args ()
5655 if not args .enable_save and args .enable_git :
5756 parser .error ("--enable-git requires --enable-save" )
57+ if args .quarter != QUARTER :
58+ global PATHS
59+ PATHS = shared .update_paths (LOGGER , PATHS , QUARTER , args .quarter )
5860 args .logger = LOGGER
5961 args .paths = PATHS
6062 return args
@@ -302,150 +304,39 @@ def process_totals_by_country(args, data):
302304 data_to_csv (args , data , file_path )
303305
304306
305- # def load_quarter_data(quarter):
306- # """
307- # Load data for a specific quarter.
308- # """
309- # file_path = os.path.join(PATHS["data"], f"{quarter}",
310- # "1-fetch", "gcs_fetched.csv")
311- # if not os.path.exists(file_path):
312- # LOGGER.error(f"Data file for quarter {quarter} not found.")
313- # return None
314- # return pd.read_csv(file_path)
315-
316-
317- # def compare_data(current_quarter, previous_quarter):
318- # """
319- # Compare data between two quarters.
320- # """
321- # current_data = load_quarter_data(current_quarter)
322- # previous_data = load_quarter_data(previous_quarter)
323-
324- # if current_data is None or previous_data is None:
325- # return
326-
327- # # Process the data to compare by country
328- # compare_by_country(current_data, previous_data,
329- # current_quarter, previous_quarter)
330-
331- # # Process the data to compare by license
332- # compare_by_license(current_data, previous_data,
333- # current_quarter, previous_quarter)
334-
335- # # Process the data to compare by language
336- # compare_by_language(current_data, previous_data,
337- # current_quarter, previous_quarter)
338-
339-
340- # def compare_by_country(current_data, previous_data,
341- # current_quarter, previous_quarter):
342- # """
343- # Compare the number of webpages licensed by country between two quarters.
344- # """
345- # LOGGER.info(f"Comparing data by country between
346- # {current_quarter} and {previous_quarter}.")
347-
348- # # Get the list of country columns dynamically
349- # columns = [col.strip() for col in current_data.columns.tolist()]
350- # start_index = columns.index("United States")
351- # end_index = columns.index("Japan") + 1
352-
353- # countries = columns[start_index:end_index]
354-
355- # current_country_data = current_data[countries].sum()
356- # previous_country_data = previous_data[countries].sum()
357-
358- # comparison = pd.DataFrame({
359- # 'Country': countries,
360- # f'{current_quarter}': current_country_data.values,
361- # f'{previous_quarter}': previous_country_data.values,
362- # 'Difference': current_country_data.values
363- # - previous_country_data.values
364- # })
365-
366- # LOGGER.info(f"Country comparison:\n{comparison}")
367-
368- # # Visualization code to be added here
369-
370-
371- # def compare_by_license(current_data, previous_data,
372- # current_quarter, previous_quarter):
373- # """
374- # Compare the number of webpages licensed by license type
375- # between two quarters.
376- # """
377- # LOGGER.info(f"Comparing data by license type
378- # between {current_quarter} and {previous_quarter}.")
379-
380- # current_license_data =
381- # current_data.groupby('LICENSE TYPE').sum().sum(axis=1)
382- # previous_license_data =
383- # previous_data.groupby('LICENSE TYPE').sum().sum(axis=1)
384-
385- # comparison = pd.DataFrame({
386- # 'License Type': current_license_data.index,
387- # f'{current_quarter}': current_license_data.values,
388- # f'{previous_quarter}': previous_license_data.values,
389- # 'Difference': current_license_data.values
390- # - previous_license_data.values
391- # })
392-
393- # LOGGER.info(f"License type comparison:\n{comparison}")
394-
395- # # Visualization code to be added here
396-
397-
398- # def compare_by_language(current_data, previous_data,
399- # current_quarter, previous_quarter):
400- # """
401- # Compare the number of webpages licensed by language between two quarters.
402- # """
403- # LOGGER.info(f"Comparing data by language between
404- # {current_quarter} and {previous_quarter}.")
405-
406- # # Get the list of language columns dynamically
407- # columns = [col.strip() for col in current_data.columns.tolist()]
408- # start_index = columns.index("English")
409- # languages = columns[start_index:]
410-
411- # current_language_data = current_data[languages].sum()
412- # previous_language_data = previous_data[languages].sum()
413-
414- # comparison = pd.DataFrame({
415- # 'Language': languages,
416- # f'{current_quarter}': current_language_data.values,
417- # f'{previous_quarter}': previous_language_data.values,
418- # 'Difference': current_language_data.values
419- # - previous_language_data.values
420- # })
421-
422- # LOGGER.info(f"Language comparison:\n{comparison}")
423-
424-
425307def main ():
426308 args = parse_arguments ()
427309 shared .log_paths (LOGGER , PATHS )
428310 shared .git_fetch_and_merge (args , PATHS ["repo" ])
429311
430312 # Count data
431- count_data = pd .read_csv (FILE1_COUNT , usecols = ["TOOL_IDENTIFIER" , "COUNT" ])
313+ file1_count = shared .path_join (PATHS ["data_1-fetch" ], "gcs_1_count.csv" )
314+ count_data = pd .read_csv (file1_count , usecols = ["TOOL_IDENTIFIER" , "COUNT" ])
432315 process_product_totals (args , count_data )
433316 process_current_old_retired_totals (args , count_data )
434317 process_totals_by_free_cultural (args , count_data )
435318 process_totals_by_restrictions (args , count_data )
436319
437320 # Language data
321+ file2_language = shared .path_join (
322+ PATHS ["data_1-fetch" ], "gcs_2_count_by_language.csv"
323+ )
438324 language_data = pd .read_csv (
439- FILE2_LANGUAGE , usecols = ["TOOL_IDENTIFIER" , "LANGUAGE" , "COUNT" ]
325+ file2_language , usecols = ["TOOL_IDENTIFIER" , "LANGUAGE" , "COUNT" ]
440326 )
441327 process_totals_by_language (args , language_data )
442328
443329 # Country data
330+ file3_country = shared .path_join (
331+ PATHS ["data_1-fetch" ], "gcs_3_count_by_country.csv"
332+ )
444333 country_data = pd .read_csv (
445- FILE3_COUNTRY , usecols = ["TOOL_IDENTIFIER" , "COUNTRY" , "COUNT" ]
334+ file3_country , usecols = ["TOOL_IDENTIFIER" , "COUNTRY" , "COUNT" ]
446335 )
447336 process_totals_by_country (args , country_data )
448337
338+ # TODO: compare with previous quarter, previous year
339+
449340 args = shared .git_add_and_commit (
450341 args ,
451342 PATHS ["repo" ],
0 commit comments