Skip to content

Commit 223c043

Browse files
committed
GitHub processing and reporting
1 parent f868c41 commit 223c043

2 files changed

Lines changed: 289 additions & 105 deletions

File tree

scripts/2-process/github_process.py

Lines changed: 136 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,15 @@
44
for analysis and comparison between quarters.
55
"""
66
# Standard library
7+
import argparse
8+
import csv
79
import os
810
import sys
911
import traceback
1012

13+
# Third-party
1114
# import pandas as pd
15+
import pandas as pd
1216

1317
# Add parent directory so shared can be imported
1418
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +23,121 @@
1923
# Setup
2024
LOGGER, PATHS = shared.setup(__file__)
2125

26+
# Constants
27+
QUARTER = os.path.basename(PATHS["data_quarter"])
28+
29+
30+
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.

    Side effects: rebinds the module-level PATHS when --quarter differs
    from the current QUARTER, and attaches LOGGER/PATHS to the returned
    namespace as args.logger / args.paths.
    """
    # Declared up front: this function may rebind the module-level PATHS
    global PATHS
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    parser.add_argument(
        "--enable_save",
        action="store_true",
        help="Enable saving results (default: False)",
    )
    parser.add_argument(
        "--enable_git",
        action="store_true",
        help="Enable git actions such as fetch, merge, add, commit, and push"
        " (default: False)",
    )
    args = parser.parse_args()
    if not args.enable_save and args.enable_git:
        # Message must use the option names actually declared above
        # (underscores); the hyphenated forms are not recognized flags.
        parser.error("--enable_git requires --enable_save")
    if args.quarter != QUARTER:
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
    args.logger = LOGGER
    args.paths = PATHS
    return args
61+
62+
63+
def data_to_csv(args, data, file_path):
    """
    Write a DataFrame to file_path as CSV, if saving is enabled.

    Args:
        args: parsed argument namespace; writing is skipped entirely
            unless args.enable_save is True.
        data: pandas DataFrame to serialize.
        file_path: destination CSV path; the parent directory is created
            if it does not exist.
    """
    if not args.enable_save:
        return
    # Create the target file's own parent directory rather than assuming
    # the global PATHS["data_phase"] dir — works for any file_path and
    # removes a hidden dependency on module state.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # emulate csv.unix_dialect
    data.to_csv(
        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
    )
71+
72+
73+
def process_totals_by_code_license(args, count_data):
    """
    Processing count data: totals by Code License
    """
    LOGGER.info(process_totals_by_code_license.__doc__.strip())
    data = {
        "Code License": 0,
        "Content License": 0,
    }
    for row in count_data.itertuples(index=False):
        tool = str(row.TOOL_IDENTIFIER)
        count = int(row.COUNT)

        # Aggregate row, not a license identifier
        if tool == "Total_repositories":
            continue

        if tool in [
            "MIT No Attribution",
            "BSD Zero Clause License",
            "Unlicense",
        ]:
            key = "Code License"
        # "CC BY-SA 4.0" matches the identifier spelling used by
        # process_totals_by_restriction; the previous "CC-BY-SA 4.0"
        # never matched, silently dropping those counts.
        elif tool in ["CC0 1.0", "CC BY 4.0", "CC BY-SA 4.0"]:
            key = "Content License"
        else:
            continue

        data[key] += count
    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
    data.sort_values("Count", ascending=False, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "github_totals_by_code_license.csv"
    )
    data_to_csv(args, data, file_path)
108+
109+
110+
def process_totals_by_restriction(args, count_data):
    """
    Processing count data: totals by Approved for Free Cultural Works
    """
    # https://creativecommons.org/public-domain/freeworks/
    LOGGER.info(process_totals_by_restriction.__doc__.strip())
    # Map each recognized license identifier to its restriction category;
    # anything absent from this table (e.g. Total_repositories) is ignored.
    categories = {
        "BSD Zero Clause License": "Public domain",
        "CC0 1.0": "Public domain",
        "Unlicense": "Public domain",
        "MIT No Attribution": "Permissive",
        "CC BY 4.0": "Permissive",
        "CC BY-SA 4.0": "Copyleft",
    }
    totals = {"Copyleft": 0, "Permissive": 0, "Public domain": 0}
    for row in count_data.itertuples(index=False):
        category = categories.get(str(row.TOOL_IDENTIFIER))
        if category is None:
            continue
        totals[category] += int(row.COUNT)
    data = pd.DataFrame(totals.items(), columns=["Category", "Count"])
    data.sort_values("Count", ascending=False, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "github_totals_by_restriction.csv"
    )
    data_to_csv(args, data, file_path)
139+
140+
22141
# def load_quarter_data(quarter):
23142
# """
24143
# Load data for a specific quarter.
@@ -63,18 +182,23 @@
63182

64183

65184
def main():
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)

    # Sync the repository before reading or writing data
    shared.git_fetch_and_merge(args, PATHS["repo"])

    # Load the fetched GitHub counts and produce the per-category totals
    count_csv = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
    count_data = pd.read_csv(count_csv, usecols=["TOOL_IDENTIFIER", "COUNT"])
    process_totals_by_restriction(args, count_data)
    process_totals_by_code_license(args, count_data)

    # Commit and push the processed data
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Github data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
78202

79203

80204
if __name__ == "__main__":

0 commit comments

Comments
 (0)