Skip to content

Commit 5a1e162

Browse files
committed
add and fix support for specifying quarter
1 parent a734985 commit 5a1e162

3 files changed

Lines changed: 54 additions & 153 deletions

File tree

scripts/2-process/gcs_process.py

Lines changed: 23 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,6 @@
2626
LOGGER, PATHS = shared.setup(__file__)
2727

2828
# Constants
29-
FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
30-
FILE2_LANGUAGE = shared.path_join(
31-
PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
32-
)
33-
FILE3_COUNTRY = shared.path_join(
34-
PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
35-
)
3629
QUARTER = os.path.basename(PATHS["data_quarter"])
3730

3831

@@ -42,19 +35,28 @@ def parse_arguments():
4235
"""
4336
LOGGER.info("Parsing command-line options")
4437
parser = argparse.ArgumentParser(description=__doc__)
38+
parser.add_argument(
39+
"--quarter",
40+
default=QUARTER,
41+
help=f"Data quarter in format YYYYQx (default: {QUARTER})",
42+
)
4543
parser.add_argument(
4644
"--enable-save",
4745
action="store_true",
48-
help="Enable saving results",
46+
help="Enable saving results (default: False)",
4947
)
5048
parser.add_argument(
5149
"--enable-git",
5250
action="store_true",
53-
help="Enable git actions (fetch, merge, add, commit, and push)",
51+
help="Enable git actions such as fetch, merge, add, commit, and push"
52+
" (default: False)",
5453
)
5554
args = parser.parse_args()
5655
if not args.enable_save and args.enable_git:
5756
parser.error("--enable-git requires --enable-save")
57+
if args.quarter != QUARTER:
58+
global PATHS
59+
PATHS = shared.update_paths(LOGGER, PATHS, QUARTER, args.quarter)
5860
args.logger = LOGGER
5961
args.paths = PATHS
6062
return args
@@ -302,150 +304,39 @@ def process_totals_by_country(args, data):
302304
data_to_csv(args, data, file_path)
303305

304306

305-
# def load_quarter_data(quarter):
306-
# """
307-
# Load data for a specific quarter.
308-
# """
309-
# file_path = os.path.join(PATHS["data"], f"{quarter}",
310-
# "1-fetch", "gcs_fetched.csv")
311-
# if not os.path.exists(file_path):
312-
# LOGGER.error(f"Data file for quarter {quarter} not found.")
313-
# return None
314-
# return pd.read_csv(file_path)
315-
316-
317-
# def compare_data(current_quarter, previous_quarter):
318-
# """
319-
# Compare data between two quarters.
320-
# """
321-
# current_data = load_quarter_data(current_quarter)
322-
# previous_data = load_quarter_data(previous_quarter)
323-
324-
# if current_data is None or previous_data is None:
325-
# return
326-
327-
# # Process the data to compare by country
328-
# compare_by_country(current_data, previous_data,
329-
# current_quarter, previous_quarter)
330-
331-
# # Process the data to compare by license
332-
# compare_by_license(current_data, previous_data,
333-
# current_quarter, previous_quarter)
334-
335-
# # Process the data to compare by language
336-
# compare_by_language(current_data, previous_data,
337-
# current_quarter, previous_quarter)
338-
339-
340-
# def compare_by_country(current_data, previous_data,
341-
# current_quarter, previous_quarter):
342-
# """
343-
# Compare the number of webpages licensed by country between two quarters.
344-
# """
345-
# LOGGER.info(f"Comparing data by country between
346-
# {current_quarter} and {previous_quarter}.")
347-
348-
# # Get the list of country columns dynamically
349-
# columns = [col.strip() for col in current_data.columns.tolist()]
350-
# start_index = columns.index("United States")
351-
# end_index = columns.index("Japan") + 1
352-
353-
# countries = columns[start_index:end_index]
354-
355-
# current_country_data = current_data[countries].sum()
356-
# previous_country_data = previous_data[countries].sum()
357-
358-
# comparison = pd.DataFrame({
359-
# 'Country': countries,
360-
# f'{current_quarter}': current_country_data.values,
361-
# f'{previous_quarter}': previous_country_data.values,
362-
# 'Difference': current_country_data.values
363-
# - previous_country_data.values
364-
# })
365-
366-
# LOGGER.info(f"Country comparison:\n{comparison}")
367-
368-
# # Visualization code to be added here
369-
370-
371-
# def compare_by_license(current_data, previous_data,
372-
# current_quarter, previous_quarter):
373-
# """
374-
# Compare the number of webpages licensed by license type
375-
# between two quarters.
376-
# """
377-
# LOGGER.info(f"Comparing data by license type
378-
# between {current_quarter} and {previous_quarter}.")
379-
380-
# current_license_data =
381-
# current_data.groupby('LICENSE TYPE').sum().sum(axis=1)
382-
# previous_license_data =
383-
# previous_data.groupby('LICENSE TYPE').sum().sum(axis=1)
384-
385-
# comparison = pd.DataFrame({
386-
# 'License Type': current_license_data.index,
387-
# f'{current_quarter}': current_license_data.values,
388-
# f'{previous_quarter}': previous_license_data.values,
389-
# 'Difference': current_license_data.values
390-
# - previous_license_data.values
391-
# })
392-
393-
# LOGGER.info(f"License type comparison:\n{comparison}")
394-
395-
# # Visualization code to be added here
396-
397-
398-
# def compare_by_language(current_data, previous_data,
399-
# current_quarter, previous_quarter):
400-
# """
401-
# Compare the number of webpages licensed by language between two quarters.
402-
# """
403-
# LOGGER.info(f"Comparing data by language between
404-
# {current_quarter} and {previous_quarter}.")
405-
406-
# # Get the list of language columns dynamically
407-
# columns = [col.strip() for col in current_data.columns.tolist()]
408-
# start_index = columns.index("English")
409-
# languages = columns[start_index:]
410-
411-
# current_language_data = current_data[languages].sum()
412-
# previous_language_data = previous_data[languages].sum()
413-
414-
# comparison = pd.DataFrame({
415-
# 'Language': languages,
416-
# f'{current_quarter}': current_language_data.values,
417-
# f'{previous_quarter}': previous_language_data.values,
418-
# 'Difference': current_language_data.values
419-
# - previous_language_data.values
420-
# })
421-
422-
# LOGGER.info(f"Language comparison:\n{comparison}")
423-
424-
425307
def main():
426308
args = parse_arguments()
427309
shared.log_paths(LOGGER, PATHS)
428310
shared.git_fetch_and_merge(args, PATHS["repo"])
429311

430312
# Count data
431-
count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"])
313+
file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
314+
count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
432315
process_product_totals(args, count_data)
433316
process_current_old_retired_totals(args, count_data)
434317
process_totals_by_free_cultural(args, count_data)
435318
process_totals_by_restrictions(args, count_data)
436319

437320
# Language data
321+
file2_language = shared.path_join(
322+
PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
323+
)
438324
language_data = pd.read_csv(
439-
FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
325+
file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
440326
)
441327
process_totals_by_language(args, language_data)
442328

443329
# Country data
330+
file3_country = shared.path_join(
331+
PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
332+
)
444333
country_data = pd.read_csv(
445-
FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
334+
file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
446335
)
447336
process_totals_by_country(args, country_data)
448337

338+
# TODO: compare with previous quarter, previous year
339+
449340
args = shared.git_add_and_commit(
450341
args,
451342
PATHS["repo"],

scripts/3-report/gcs_report.py

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -42,26 +42,30 @@ def parse_arguments():
4242
parser.add_argument(
4343
"--quarter",
4444
default=QUARTER,
45-
help="Data quarter in format YYYYQx, e.g., 2024Q2",
45+
help=f"Data quarter in format YYYYQx (default: {QUARTER})",
4646
)
4747
parser.add_argument(
4848
"--show-plots",
4949
action="store_true",
50-
help="Show generated plots (in addition to saving them)",
50+
help="Show generated plots (default: False)",
5151
)
5252
parser.add_argument(
5353
"--enable-save",
5454
action="store_true",
55-
help="Enable saving results",
55+
help="Enable saving results (default: False)",
5656
)
5757
parser.add_argument(
5858
"--enable-git",
5959
action="store_true",
60-
help="Enable git actions (fetch, merge, add, commit, and push)",
60+
help="Enable git actions such as fetch, merge, add, commit, and push"
61+
" (default: False)",
6162
)
6263
args = parser.parse_args()
6364
if not args.enable_save and args.enable_git:
6465
parser.error("--enable-git requires --enable-save")
66+
if args.quarter != QUARTER:
67+
global PATHS
68+
PATHS = shared.update_paths(LOGGER, PATHS, QUARTER, args.quarter)
6569
args.logger = LOGGER
6670
args.paths = PATHS
6771
return args
@@ -203,7 +207,8 @@ def gcs_intro(args):
203207
"""
204208
LOGGER.info(gcs_intro.__doc__.strip())
205209
file_path = shared.path_join(
206-
PATHS["data"], args.quarter, "2-process", "gcs_product_totals.csv"
210+
PATHS["data_2-process"],
211+
"gcs_product_totals.csv",
207212
)
208213
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
209214
name_label = "CC legal tool product"
@@ -234,7 +239,7 @@ def plot_products(args):
234239
"""
235240
LOGGER.info(plot_products.__doc__.strip())
236241
file_path = shared.path_join(
237-
PATHS["data"], args.quarter, "2-process", "gcs_product_totals.csv"
242+
PATHS["data_2-process"], "gcs_product_totals.csv"
238243
)
239244
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
240245
name_label = "CC legal tool product"
@@ -278,9 +283,7 @@ def plot_tool_status(args):
278283
"""
279284
LOGGER.info(plot_tool_status.__doc__.strip())
280285
file_path = shared.path_join(
281-
PATHS["data"],
282-
args.quarter,
283-
"2-process",
286+
PATHS["data_2-process"],
284287
"gcs_status_combined_totals.csv",
285288
)
286289
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
@@ -323,9 +326,7 @@ def plot_current_tools(args):
323326
"""
324327
LOGGER.info(plot_current_tools.__doc__.strip())
325328
file_path = shared.path_join(
326-
PATHS["data"],
327-
args.quarter,
328-
"2-process",
329+
PATHS["data_2-process"],
329330
"gcs_status_current_totals.csv",
330331
)
331332
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
@@ -368,7 +369,7 @@ def plot_old_tools(args):
368369
"""
369370
LOGGER.info(plot_old_tools.__doc__.strip())
370371
file_path = shared.path_join(
371-
PATHS["data"], args.quarter, "2-process", "gcs_status_old_totals.csv"
372+
PATHS["data_2-process"], "gcs_status_old_totals.csv"
372373
)
373374
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
374375
name_label = "CC legal tool"
@@ -412,9 +413,7 @@ def plot_retired_tools(args):
412413
"""
413414
LOGGER.info(plot_retired_tools.__doc__.strip())
414415
file_path = shared.path_join(
415-
PATHS["data"],
416-
args.quarter,
417-
"2-process",
416+
PATHS["data_2-process"],
418417
"gcs_status_retired_totals.csv",
419418
)
420419
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
@@ -460,7 +459,7 @@ def plot_countries_highest_usage(args):
460459
"""
461460
LOGGER.info(plot_countries_highest_usage.__doc__.strip())
462461
file_path = shared.path_join(
463-
PATHS["data"], args.quarter, "2-process", "gcs_totals_by_country.csv"
462+
PATHS["data_2-process"], "gcs_totals_by_country.csv"
464463
)
465464
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
466465
name_label = "Country"
@@ -513,7 +512,7 @@ def plot_languages_highest_usage(args):
513512
"""
514513
LOGGER.info(plot_languages_highest_usage.__doc__.strip())
515514
file_path = shared.path_join(
516-
PATHS["data"], args.quarter, "2-process", "gcs_totals_by_language.csv"
515+
PATHS["data_2-process"], "gcs_totals_by_language.csv"
517516
)
518517
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
519518
name_label = "Language"
@@ -566,9 +565,7 @@ def plot_free_culture(args):
566565
"""
567566
LOGGER.info(plot_free_culture.__doc__.strip())
568567
file_path = shared.path_join(
569-
PATHS["data"],
570-
args.quarter,
571-
"2-process",
568+
PATHS["data_2-process"],
572569
"gcs_totals_by_free_cultural.csv",
573570
)
574571
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")

scripts/shared.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,19 @@ def setup(current_file):
4848
return logger, paths
4949

5050

51+
def _replace_last_component(path, old, new):
    """
    Return path with its last component equal to old replaced by new.
    """
    # An empty component can never be matched meaningfully
    if not old:
        return path
    # Accept both separator styles so the result does not depend on platform
    separators = ("/", "\\")
    search_end = len(path)
    while True:
        start = path.rfind(old, 0, search_end)
        if start == -1:
            return path
        stop = start + len(old)
        begins_component = start == 0 or path[start - 1] in separators
        ends_component = stop == len(path) or path[stop] in separators
        if begins_component and ends_component:
            return path[:start] + new + path[stop:]
        # Partial match (for example inside a longer directory name): keep
        # searching strictly to the left of this occurrence
        search_end = stop - 1


def update_paths(logger, paths, old_quarter, new_quarter):
    """
    Point the quarter-specific entries of paths at a different quarter.

    Only the last path component that is exactly old_quarter is replaced in
    each entry. Parent directories that merely contain the quarter text
    (for example a checkout located under "2024Q3-work") are left untouched.

    The paths dictionary is updated in place and is also returned. A
    KeyError is raised if any of the quarter-specific labels is missing.
    """
    logger.info(f"Updating paths: replacing {old_quarter} with {new_quarter}")
    for label in [
        "data_1-fetch",
        "data_2-process",
        "data_3-report",
        "data_phase",
        "data_quarter",
    ]:
        paths[label] = _replace_last_component(
            paths[label], old_quarter, new_quarter
        )
    return paths
62+
63+
5164
def log_paths(logger, paths):
5265
paths_list = []
5366
repo_path = paths["repo"]

0 commit comments

Comments
 (0)