add and fix support for specifying quarter

TimidRobot · TimidRobot · commit 5a1e1625b0af · 2025-01-06T14:44:43.000-08:00
diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
@@ -26,13 +26,6 @@
 LOGGER, PATHS = shared.setup(__file__)
 
 # Constants
-FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
-FILE2_LANGUAGE = shared.path_join(
-    PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
-)
-FILE3_COUNTRY = shared.path_join(
-    PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
-)
 QUARTER = os.path.basename(PATHS["data_quarter"])
 
 
@@ -42,19 +35,28 @@ def parse_arguments():
     """
     LOGGER.info("Parsing command-line options")
     parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--quarter",
+        default=QUARTER,
+        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
+    )
     parser.add_argument(
         "--enable-save",
         action="store_true",
-        help="Enable saving results",
+        help="Enable saving results (default: False)",
     )
     parser.add_argument(
         "--enable-git",
         action="store_true",
-        help="Enable git actions (fetch, merge, add, commit, and push)",
+        help="Enable git actions such as fetch, merge, add, commit, and push"
+        " (default: False)",
     )
     args = parser.parse_args()
     if not args.enable_save and args.enable_git:
         parser.error("--enable-git requires --enable-save")
+    if args.quarter != QUARTER:
+        global PATHS
+        PATHS = shared.update_paths(LOGGER, PATHS, QUARTER, args.quarter)
     args.logger = LOGGER
     args.paths = PATHS
     return args
@@ -302,150 +304,39 @@ def process_totals_by_country(args, data):
     data_to_csv(args, data, file_path)
 
 
-# def load_quarter_data(quarter):
-#     """
-#     Load data for a specific quarter.
-#     """
-#     file_path = os.path.join(PATHS["data"], f"{quarter}",
-#       "1-fetch", "gcs_fetched.csv")
-#     if not os.path.exists(file_path):
-#         LOGGER.error(f"Data file for quarter {quarter} not found.")
-#         return None
-#     return pd.read_csv(file_path)
-
-
-# def compare_data(current_quarter, previous_quarter):
-#     """
-#     Compare data between two quarters.
-#     """
-#     current_data = load_quarter_data(current_quarter)
-#     previous_data = load_quarter_data(previous_quarter)
-
-#     if current_data is None or previous_data is None:
-#         return
-
-#     # Process the data to compare by country
-#     compare_by_country(current_data, previous_data,
-#     current_quarter, previous_quarter)
-
-#     # Process the data to compare by license
-#     compare_by_license(current_data, previous_data,
-#       current_quarter, previous_quarter)
-
-#     # Process the data to compare by language
-#     compare_by_language(current_data, previous_data,
-#       current_quarter, previous_quarter)
-
-
-# def compare_by_country(current_data, previous_data,
-#         current_quarter, previous_quarter):
-#     """
-#     Compare the number of webpages licensed by country between two quarters.
-#     """
-#     LOGGER.info(f"Comparing data by country between
-#       {current_quarter} and {previous_quarter}.")
-
-#     # Get the list of country columns dynamically
-#     columns = [col.strip() for col in current_data.columns.tolist()]
-#     start_index = columns.index("United States")
-#     end_index = columns.index("Japan") + 1
-
-#     countries = columns[start_index:end_index]
-
-#     current_country_data = current_data[countries].sum()
-#     previous_country_data = previous_data[countries].sum()
-
-#     comparison = pd.DataFrame({
-#         'Country': countries,
-#         f'{current_quarter}': current_country_data.values,
-#         f'{previous_quarter}': previous_country_data.values,
-#         'Difference': current_country_data.values
-#            - previous_country_data.values
-#     })
-
-#     LOGGER.info(f"Country comparison:\n{comparison}")
-
-#     # Visualization code to be added here
-
-
-# def compare_by_license(current_data, previous_data,
-#   current_quarter, previous_quarter):
-#     """
-#     Compare the number of webpages licensed by license type
-#   between two quarters.
-#     """
-#     LOGGER.info(f"Comparing data by license type
-#       between {current_quarter} and {previous_quarter}.")
-
-#     current_license_data =
-#       current_data.groupby('LICENSE TYPE').sum().sum(axis=1)
-#     previous_license_data =
-#       previous_data.groupby('LICENSE TYPE').sum().sum(axis=1)
-
-#     comparison = pd.DataFrame({
-#         'License Type': current_license_data.index,
-#         f'{current_quarter}': current_license_data.values,
-#         f'{previous_quarter}': previous_license_data.values,
-#         'Difference': current_license_data.values
-#           - previous_license_data.values
-#     })
-
-#     LOGGER.info(f"License type comparison:\n{comparison}")
-
-#     # Visualization code to be added here
-
-
-# def compare_by_language(current_data, previous_data,
-#           current_quarter, previous_quarter):
-#     """
-#     Compare the number of webpages licensed by language between two quarters.
-#     """
-#     LOGGER.info(f"Comparing data by language between
-#                   {current_quarter} and {previous_quarter}.")
-
-#     # Get the list of language columns dynamically
-#     columns = [col.strip() for col in current_data.columns.tolist()]
-#     start_index = columns.index("English")
-#     languages = columns[start_index:]
-
-#     current_language_data = current_data[languages].sum()
-#     previous_language_data = previous_data[languages].sum()
-
-#     comparison = pd.DataFrame({
-#         'Language': languages,
-#         f'{current_quarter}': current_language_data.values,
-#         f'{previous_quarter}': previous_language_data.values,
-#         'Difference': current_language_data.values
-#           - previous_language_data.values
-#     })
-
-#     LOGGER.info(f"Language comparison:\n{comparison}")
-
-
 def main():
     args = parse_arguments()
     shared.log_paths(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
 
     # Count data
-    count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
+    count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
     process_product_totals(args, count_data)
     process_current_old_retired_totals(args, count_data)
     process_totals_by_free_cultural(args, count_data)
     process_totals_by_restrictions(args, count_data)
 
-    # Langauge data
+    # Language data
+    file2_language = shared.path_join(
+        PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
+    )
     language_data = pd.read_csv(
-        FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+        file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
     )
     process_totals_by_language(args, language_data)
 
     # Country data
+    file3_country = shared.path_join(
+        PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
+    )
     country_data = pd.read_csv(
-        FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
+        file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
     )
     process_totals_by_country(args, country_data)
 
+    # TODO: compare with previous quarter, previous year
+
     args = shared.git_add_and_commit(
         args,
         PATHS["repo"],
diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py
@@ -42,26 +42,30 @@ def parse_arguments():
     parser.add_argument(
         "--quarter",
         default=QUARTER,
-        help="Data quarter in format YYYYQx, e.g., 2024Q2",
+        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
     )
     parser.add_argument(
         "--show-plots",
         action="store_true",
-        help="Show generated plots (in addition to saving them)",
+        help="Show generated plots (default: False)",
     )
     parser.add_argument(
         "--enable-save",
         action="store_true",
-        help="Enable saving results",
+        help="Enable saving results (default: False)",
     )
     parser.add_argument(
         "--enable-git",
         action="store_true",
-        help="Enable git actions (fetch, merge, add, commit, and push)",
+        help="Enable git actions such as fetch, merge, add, commit, and push"
+        " (default: False)",
     )
     args = parser.parse_args()
     if not args.enable_save and args.enable_git:
         parser.error("--enable-git requires --enable-save")
+    if args.quarter != QUARTER:
+        global PATHS
+        PATHS = shared.update_paths(LOGGER, PATHS, QUARTER, args.quarter)
     args.logger = LOGGER
     args.paths = PATHS
     return args
@@ -203,7 +207,8 @@ def gcs_intro(args):
     """
     LOGGER.info(gcs_intro.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data"], args.quarter, "2-process", "gcs_product_totals.csv"
+        PATHS["data_2-process"],
+        "gcs_product_totals.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
@@ -234,7 +239,7 @@ def plot_products(args):
     """
     LOGGER.info(plot_products.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data"], args.quarter, "2-process", "gcs_product_totals.csv"
+        PATHS["data_2-process"], "gcs_product_totals.csv"
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
@@ -278,9 +283,7 @@ def plot_tool_status(args):
     """
     LOGGER.info(plot_tool_status.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data"],
-        args.quarter,
-        "2-process",
+        PATHS["data_2-process"],
         "gcs_status_combined_totals.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
@@ -323,9 +326,7 @@ def plot_current_tools(args):
     """
     LOGGER.info(plot_current_tools.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data"],
-        args.quarter,
-        "2-process",
+        PATHS["data_2-process"],
         "gcs_status_current_totals.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
@@ -368,7 +369,7 @@ def plot_old_tools(args):
     """
     LOGGER.info(plot_old_tools.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data"], args.quarter, "2-process", "gcs_status_old_totals.csv"
+        PATHS["data_2-process"], "gcs_status_old_totals.csv"
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
@@ -412,9 +413,7 @@ def plot_retired_tools(args):
     """
     LOGGER.info(plot_retired_tools.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data"],
-        args.quarter,
-        "2-process",
+        PATHS["data_2-process"],
         "gcs_status_retired_totals.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
@@ -460,7 +459,7 @@ def plot_countries_highest_usage(args):
     """
     LOGGER.info(plot_countries_highest_usage.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data"], args.quarter, "2-process", "gcs_totals_by_country.csv"
+        PATHS["data_2-process"], "gcs_totals_by_country.csv"
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Country"
@@ -513,7 +512,7 @@ def plot_languages_highest_usage(args):
     """
     LOGGER.info(plot_languages_highest_usage.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data"], args.quarter, "2-process", "gcs_totals_by_language.csv"
+        PATHS["data_2-process"], "gcs_totals_by_language.csv"
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
@@ -566,9 +565,7 @@ def plot_free_culture(args):
     """
     LOGGER.info(plot_free_culture.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data"],
-        args.quarter,
-        "2-process",
+        PATHS["data_2-process"],
         "gcs_totals_by_free_cultural.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
diff --git a/scripts/shared.py b/scripts/shared.py
@@ -48,6 +48,43 @@ def setup(current_file):
     return logger, paths
 
 
+def update_paths(logger, paths, old_quarter, new_quarter):
+    """
+    Repoint the quarter-specific entries of paths at a different quarter.
+
+    Only the last occurrence of old_quarter in each path is replaced, so an
+    identical substring earlier in the path (for example, in a directory
+    above the repository) is left untouched.
+
+    Args:
+        logger: logger used to report the update
+        paths: dictionary of path strings; updated in place
+        old_quarter: quarter text currently embedded in the paths
+        new_quarter: quarter text to use instead (format YYYYQx)
+
+    Returns:
+        The same paths dictionary, so callers can reassign the result.
+
+    Raises:
+        KeyError: if one of the quarter-specific labels is missing
+        ValueError: if old_quarter is an empty string
+    """
+    logger.info(f"Updating paths: replacing {old_quarter} with {new_quarter}")
+    for label in [
+        "data_1-fetch",
+        "data_2-process",
+        "data_3-report",
+        "data_phase",
+        "data_quarter",
+    ]:
+        # rpartition() splits on the rightmost match only; when old_quarter
+        # is absent the separator comes back empty and the path is kept
+        head, found, tail = paths[label].rpartition(old_quarter)
+        if found:
+            paths[label] = f"{head}{new_quarter}{tail}"
+    return paths
+
+
 def log_paths(logger, paths):
     paths_list = []
     repo_path = paths["repo"]