Skip to content

Commit 7beebb3

Browse files
committed
GitHub processing and reporting
1 parent 223c043 commit 7beebb3

2 files changed

Lines changed: 151 additions & 7 deletions

File tree

scripts/2-process/github_process.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def process_totals_by_code_license(args, count_data):
8383
tool = str(row.TOOL_IDENTIFIER)
8484
count = int(row.COUNT)
8585

86-
if tool == "Total_repositories":
86+
if tool == "Total public repositories":
8787
continue
8888

8989
if tool in [
@@ -107,6 +107,31 @@ def process_totals_by_code_license(args, count_data):
107107
data_to_csv(args, data, file_path)
108108

109109

110+
def process_totals_by_license(args, count_data):
    """
    Processing count data: totals by License
    """
    # NOTE: the docstring above is emitted verbatim as the log message below.
    LOGGER.info(process_totals_by_license.__doc__.strip())

    # Map each tool identifier to its count. The aggregate
    # "Total public repositories" row is not a license, so it is left out.
    totals = {}
    for record in count_data.itertuples(index=False):
        license_name, repo_count = (
            str(record.TOOL_IDENTIFIER),
            int(record.COUNT),
        )
        if license_name != "Total public repositories":
            totals[license_name] = repo_count

    # Two-column frame, largest count first, with a fresh 0..n-1 index.
    frame = pd.DataFrame(list(totals.items()), columns=["License", "Count"])
    frame = frame.sort_values("Count", ascending=False).reset_index(drop=True)

    output_path = shared.path_join(
        PATHS["data_phase"], "github_totals_by_license.csv"
    )
    data_to_csv(args, frame, output_path)
133+
134+
110135
def process_totals_by_restriction(args, count_data):
111136
"""
112137
Processing count data: totals by Approved for Free Cultural Works
@@ -119,6 +144,9 @@ def process_totals_by_restriction(args, count_data):
119144
tool = str(row.TOOL_IDENTIFIER)
120145
count = int(row.COUNT)
121146

147+
if tool == "Total public repositories":
148+
continue
149+
122150
if tool in ["BSD Zero Clause License", "CC0 1.0", "Unlicense"]:
123151
key = "Public domain"
124152
elif tool in ["MIT No Attribution", "CC BY 4.0"]:
@@ -138,6 +166,44 @@ def process_totals_by_restriction(args, count_data):
138166
data_to_csv(args, data, file_path)
139167

140168

169+
def process_totals_by_rights_reserved(args, count_data):
    """
    Processing count data: totals by Rights Reserved
    """
    # NOTE: the docstring above is emitted verbatim as the log message below.
    LOGGER.info(process_totals_by_rights_reserved.__doc__.strip())

    # Tools that waive rights (public-domain equivalents / no attribution).
    no_rights_reserved = {
        "MIT No Attribution",
        "BSD Zero Clause License",
        "CC0 1.0",
        "Unlicense",
    }
    # Tools that keep some rights (attribution based CC licenses).
    # NOTE(review): the original matched only "CC-BY-SA 4.0", which is
    # inconsistent with the space-separated "CC BY 4.0" next to it. Both
    # spellings are accepted so ShareAlike counts are not silently dropped;
    # confirm the exact identifier written by the fetch step.
    rights_reserved = {
        "CC BY 4.0",
        "CC BY-SA 4.0",
        "CC-BY-SA 4.0",
    }

    data = {
        "Rights reserved": 0,
        "No rights reserved": 0,
    }
    for row in count_data.itertuples(index=False):
        tool = str(row.TOOL_IDENTIFIER)
        count = int(row.COUNT)

        # The aggregate row is not a license and must not be categorized.
        if tool == "Total public repositories":
            continue

        if tool in no_rights_reserved:
            key = "No rights reserved"
        elif tool in rights_reserved:
            key = "Rights reserved"
        else:
            # Unknown tools are ignored rather than guessed at.
            continue

        data[key] += count

    # Two-column frame, largest count first, with a fresh 0..n-1 index.
    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
    data.sort_values("Count", ascending=False, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "github_totals_by_rights_reserved.csv"
    )
    data_to_csv(args, data, file_path)
205+
206+
141207
# def load_quarter_data(quarter):
142208
# """
143209
# Load data for a specific quarter.
@@ -188,6 +254,8 @@ def main():
188254

189255
file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
190256
count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
257+
process_totals_by_license(args, count_data)
258+
process_totals_by_rights_reserved(args, count_data)
191259
process_totals_by_restriction(args, count_data)
192260
process_totals_by_code_license(args, count_data)
193261

scripts/3-report/github_report.py

Lines changed: 82 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,18 +86,51 @@ def load_data(args):
8686
return data
8787

8888

89-
def Plot_by_license_type(args):
89+
def github_intro(args):
    """
    Write Github introduction.
    """
    # NOTE: the docstring above is emitted verbatim as the log message below.
    LOGGER.info(github_intro.__doc__.strip())
    file_path = shared.path_join(
        PATHS["data_1-fetch"],
        "github_1_count.csv",
    )
    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
    name_label = "TOOL_IDENTIFIER"
    data = pd.read_csv(file_path, index_col=name_label)
    # The fetch data carries the overall repository count as its own row.
    total_repositories = data.loc["Total public repositories", "COUNT"]
    # Adjacent string literals are concatenated with no separator, so every
    # continuation fragment starts with an explicit space, and the blank
    # line between paragraphs is a real newline ("\n", not "/n").
    shared.update_readme(
        args,
        SECTION,
        "Overview",
        None,
        None,
        "Github data uses the `total_count` returned by"
        " API for search queries of the SPDX IDENTIFIER URLS"
        "\n"
        f"**The results indicate that a total of {total_repositories}"
        " repositories on GitHub use a mix of some rights reserved and"
        " no rights reserved licenses which showcases the usage of"
        " attribution based Creative Commons (CC) legal tool"
        " and Public domain equivalent.**\n"
        "\n"
        "Thank you GitHub for providing public access to"
        " repository metadata through its API.",
    )
120+
121+
122+
def Plot_by_license_type(args):
123+
"""
124+
Create plots for the languages with highest usage of latest tools
125+
"""
126+
LOGGER.info(plot_totals_by_code_license.__doc__.strip())
127+
file_path = shared.path_join(
128+
PATHS["data_2-process"],
129+
"github_totals_by_license.csv",
130+
)
131+
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
132+
name_label = "License"
133+
data_label = "Count"
101134
data = pd.read_csv(file_path, index_col=name_label)
102135

103136
title = "Totals by license type"
@@ -166,7 +199,7 @@ def plot_totals_by_code_license(args):
166199
SECTION,
167200
title,
168201
image_path,
169-
"Plots showing totals by code license.",
202+
"Plots showing totals by code license vs content license.",
170203
)
171204

172205

@@ -184,7 +217,7 @@ def plot_totals_by_restriction(args):
184217
data_label = "Count"
185218
data = pd.read_csv(file_path, index_col=name_label)
186219

187-
title = "Approved for Free Cultural Works"
220+
title = "Totals by restriction"
188221
plt = plot.combined_plot(
189222
args=args,
190223
data=data,
@@ -211,13 +244,56 @@ def plot_totals_by_restriction(args):
211244
)
212245

213246

247+
def plot_totals_by_rights_reserved(args):
    """
    Create plots showing totals by rights reserved vs no rights reserved
    """
    # NOTE: the docstring above is emitted verbatim as the log message below,
    # so it has to describe this plot rather than a copy-pasted one.
    LOGGER.info(plot_totals_by_rights_reserved.__doc__.strip())
    file_path = shared.path_join(
        PATHS["data_2-process"],
        "github_totals_by_rights_reserved.csv",
    )
    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
    # Column names of the CSV read below; the category column is the index.
    name_label = "Category"
    data_label = "Count"
    data = pd.read_csv(file_path, index_col=name_label)

    title = "Totals by Rights Reserved vs No Rights Reserved"
    plt = plot.combined_plot(
        args=args,
        data=data,
        title=title,
        name_label=name_label,
        data_label=data_label,
    )

    image_path = shared.path_join(
        PATHS["data_phase"], "github_rights_reserved.png"
    )
    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
    # The image is only written to disk when saving is enabled.
    if args.enable_save:
        # Create the directory if it does not exist
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        plt.savefig(image_path)

    shared.update_readme(
        args,
        SECTION,
        title,
        image_path,
        "Plots showing totals by rights reserved vs No rights reserved.",
    )
286+
287+
214288
def main():
215289
args = parse_arguments()
216290
shared.paths_log(LOGGER, PATHS)
217291
shared.git_fetch_and_merge(args, PATHS["repo"])
292+
github_intro(args)
218293
plot_totals_by_restriction(args)
219294
plot_totals_by_code_license(args)
220295
Plot_by_license_type(args)
296+
plot_totals_by_rights_reserved(args)
221297

222298
# Add and commit changes
223299
args = shared.git_add_and_commit(

0 commit comments

Comments
 (0)