Skip to content

Commit e72f9a6

Browse files
committed
update per isort, black, and flake8
1 parent 0326b9a commit e72f9a6

3 files changed

Lines changed: 434 additions & 318 deletions

File tree

model_sampling/dataset_sampling.ipynb

Lines changed: 63 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -179,11 +179,11 @@
179179
],
180180
"source": [
181181
"web_contents = requests.get(example_response_df.loc[0, \"link\"]).text\n",
182-
"encoding = EncodingDetector.find_declared_encoding(web_contents, is_html = True)\n",
182+
"encoding = EncodingDetector.find_declared_encoding(web_contents, is_html=True)\n",
183183
"soup = BeautifulSoup(web_contents, \"lxml\", from_encoding=encoding)\n",
184184
"for script in soup([\"script\", \"style\"]):\n",
185185
" script.extract()\n",
186-
"parse_result = soup.get_text(\"\", strip = True)\n",
186+
"parse_result = soup.get_text(\"\", strip=True)\n",
187187
"print(f\"title: {example_response_df.loc[0, 'title']}\")\n",
188188
"print(f\"url: {example_response_df.loc[0, 'link']}\")\n",
189189
"print(f\"contents\\n: {parse_result[:500]}\")"
@@ -224,15 +224,17 @@
224224
"source": [
225225
"for i in example_response_df.index:\n",
226226
" web_contents = requests.get(example_response_df.loc[i, \"link\"]).text\n",
227-
" encoding = EncodingDetector.find_declared_encoding(web_contents, is_html = True)\n",
227+
" encoding = EncodingDetector.find_declared_encoding(\n",
228+
" web_contents, is_html=True\n",
229+
" )\n",
228230
" soup = BeautifulSoup(web_contents, \"lxml\", from_encoding=encoding)\n",
229231
" for script in soup([\"script\", \"style\"]):\n",
230232
" script.extract()\n",
231-
" parse_result = soup.get_text(\"\", strip = True)\n",
233+
" parse_result = soup.get_text(\"\", strip=True)\n",
232234
" print(f\"entry {i}\")\n",
233235
" print(f\"title: {soup.title}\")\n",
234-
" '''print(f\"url: {example_response_df.loc[i, 'link']}\")\n",
235-
" print(f\"contents\\n: {parse_result[:500]}\")'''"
236+
" \"\"\"print(f\"url: {example_response_df.loc[i, 'link']}\")\n",
237+
" print(f\"contents\\n: {parse_result[:500]}\")\"\"\""
236238
]
237239
},
238240
{
@@ -373,11 +375,13 @@
373375
"sample_contents = []\n",
374376
"for address in sample_df[\"link\"]:\n",
375377
" web_contents = requests.get(address).text\n",
376-
" encoding = EncodingDetector.find_declared_encoding(web_contents, is_html = True)\n",
378+
" encoding = EncodingDetector.find_declared_encoding(\n",
379+
" web_contents, is_html=True\n",
380+
" )\n",
377381
" soup = BeautifulSoup(web_contents, \"lxml\", from_encoding=encoding)\n",
378382
" for script in soup([\"script\", \"style\"]):\n",
379383
" script.extract()\n",
380-
" parse_result = soup.get_text(\"\", strip = True)\n",
384+
" parse_result = soup.get_text(\"\", strip=True)\n",
381385
" sample_titles.append(soup.title)\n",
382386
" sample_contents.append(parse_result[:3000])\n",
383387
"sample_df[\"title\"] = sample_titles\n",
@@ -411,7 +415,7 @@
411415
}
412416
],
413417
"source": [
414-
"sample_df.to_sql(\"example\", engine, if_exists = 'append')"
418+
"sample_df.to_sql(\"example\", engine, if_exists=\"append\")"
415419
]
416420
},
417421
{
@@ -939,7 +943,7 @@
939943
" \"by-nc-sa\": \"licenses/by-nc-sa/\",\n",
940944
" \"by-nd\": \"licenses/by-nd/\",\n",
941945
" \"by-nc-nd\": \"licenses/by-nc-nd/|licenses/by-nd-nc/\",\n",
942-
" \"publicdomain\": \"publicdomain/\"\n",
946+
" \"publicdomain\": \"publicdomain/\",\n",
943947
"}\n",
944948
"license_list = pd.Series(\n",
945949
" cc_license_data[0]\n",
@@ -961,20 +965,27 @@
961965
"outputs": [],
962966
"source": [
963967
"import dataset_sampling\n",
968+
"\n",
964969
"license_map = dataset_sampling.get_license_map()\n",
965970
"license_ser = pd.concat([v for v in license_map.values()])\n",
966971
"license_ser_splits_df = license_ser.str.split(\"/\", expand=True)\n",
967972
"license_ser_splits_df = license_ser_splits_df.rename(\n",
968-
" columns = {\n",
973+
" columns={\n",
969974
" 0: \"Tool Typing\",\n",
970975
" 1: \"General Typing\",\n",
971976
" 2: \"Version\",\n",
972-
" 3: \"Jurisdiction\"\n",
977+
" 3: \"Jurisdiction\",\n",
973978
" }\n",
974979
")\n",
975-
"license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\"General Typing\"].str.replace(\"mark|zero\", \"publicdomain\", regex=True)\n",
976-
"license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\"General Typing\"].str.replace(\"by-nd-nc\", \"by-nc-nd\", regex=True)\n",
977-
"license_ser_splits_df[\"Version\"] = license_ser_splits_df[\"Version\"].astype(float)"
980+
"license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\n",
981+
" \"General Typing\"\n",
982+
"].str.replace(\"mark|zero\", \"publicdomain\", regex=True)\n",
983+
"license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\n",
984+
" \"General Typing\"\n",
985+
"].str.replace(\"by-nd-nc\", \"by-nc-nd\", regex=True)\n",
986+
"license_ser_splits_df[\"Version\"] = license_ser_splits_df[\"Version\"].astype(\n",
987+
" float\n",
988+
")"
978989
]
979990
},
980991
{
@@ -984,11 +995,21 @@
984995
"outputs": [],
985996
"source": [
986997
"license_one_hot_encoding = pd.DataFrame()\n",
987-
"license_one_hot_encoding[\"by\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"by\")\n",
988-
"license_one_hot_encoding[\"sa\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"sa\")\n",
989-
"license_one_hot_encoding[\"nc\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"nc\")\n",
990-
"license_one_hot_encoding[\"nd\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"nd\")\n",
991-
"license_not_six_type = license_ser_splits_df[\"General Typing\"].str.contains(\"by|sa|nc|nd\")\n",
998+
"license_one_hot_encoding[\"by\"] = license_ser_splits_df[\n",
999+
" \"General Typing\"\n",
1000+
"].str.contains(\"by\")\n",
1001+
"license_one_hot_encoding[\"sa\"] = license_ser_splits_df[\n",
1002+
" \"General Typing\"\n",
1003+
"].str.contains(\"sa\")\n",
1004+
"license_one_hot_encoding[\"nc\"] = license_ser_splits_df[\n",
1005+
" \"General Typing\"\n",
1006+
"].str.contains(\"nc\")\n",
1007+
"license_one_hot_encoding[\"nd\"] = license_ser_splits_df[\n",
1008+
" \"General Typing\"\n",
1009+
"].str.contains(\"nd\")\n",
1010+
"license_not_six_type = license_ser_splits_df[\"General Typing\"].str.contains(\n",
1011+
" \"by|sa|nc|nd\"\n",
1012+
")\n",
9921013
"license_one_hot_encoding[\"neither\"] = ~(license_not_six_type.fillna(False))"
9931014
]
9941015
},
@@ -998,8 +1019,9 @@
9981019
"metadata": {},
9991020
"outputs": [],
10001021
"source": [
1001-
"license_df = pd.concat([license_ser, license_ser_splits_df, license_one_hot_encoding], axis = 1)\\\n",
1002-
" .rename(columns = {0: \"license\"})"
1022+
"license_df = pd.concat(\n",
1023+
" [license_ser, license_ser_splits_df, license_one_hot_encoding], axis=1\n",
1024+
").rename(columns={0: \"license\"})"
10031025
]
10041026
},
10051027
{
@@ -1019,7 +1041,9 @@
10191041
}
10201042
],
10211043
"source": [
1022-
"sampling_engine = sqlalchemy.create_engine(f\"sqlite:///{CWD}/modeling_dataset.db\")\n",
1044+
"sampling_engine = sqlalchemy.create_engine(\n",
1045+
" f\"sqlite:///{CWD}/modeling_dataset.db\"\n",
1046+
")\n",
10231047
"sampling_engine.connect()"
10241048
]
10251049
},
@@ -1334,13 +1358,12 @@
13341358
}
13351359
],
13361360
"source": [
1337-
"tables = pd.read_sql(\"SELECT * FROM sqlite_master WHERE type='table'\", sampling_engine)\n",
1361+
"tables = pd.read_sql(\n",
1362+
" \"SELECT * FROM sqlite_master WHERE type='table'\", sampling_engine\n",
1363+
")\n",
13381364
"inspected_dataset = pd.concat(\n",
13391365
" [\n",
1340-
" pd.read_sql(\n",
1341-
" f\"SELECT * FROM '{license_type}'\",\n",
1342-
" sampling_engine\n",
1343-
" )\n",
1366+
" pd.read_sql(f\"SELECT * FROM '{license_type}'\", sampling_engine)\n",
13441367
" for license_type in tables[\"name\"]\n",
13451368
" ]\n",
13461369
")\n",
@@ -1540,14 +1563,18 @@
15401563
}
15411564
],
15421565
"source": [
1543-
"inspected_dataset_license = inspected_dataset.merge(license_df, on = \"license\")\n",
1544-
"inspected_dataset_license[\"parsed_title\"] = \\\n",
1545-
" inspected_dataset_license[\"title\"].str.extract(r\"<title>(.*)</title>\")\n",
1546-
"inspected_dataset_license[\"content_sum\"] = \\\n",
1547-
" inspected_dataset_license[\"url\"] + \\\n",
1548-
" inspected_dataset_license[\"parsed_title\"] + \\\n",
1549-
" inspected_dataset_license[\"contents\"]\n",
1550-
"inspected_dataset_license = inspected_dataset_license.groupby(\"content_sum\").agg(lambda x: x.iloc[0])\n",
1566+
"inspected_dataset_license = inspected_dataset.merge(license_df, on=\"license\")\n",
1567+
"inspected_dataset_license[\"parsed_title\"] = inspected_dataset_license[\n",
1568+
" \"title\"\n",
1569+
"].str.extract(r\"<title>(.*)</title>\")\n",
1570+
"inspected_dataset_license[\"content_sum\"] = (\n",
1571+
" inspected_dataset_license[\"url\"]\n",
1572+
" + inspected_dataset_license[\"parsed_title\"]\n",
1573+
" + inspected_dataset_license[\"contents\"]\n",
1574+
")\n",
1575+
"inspected_dataset_license = inspected_dataset_license.groupby(\n",
1576+
" \"content_sum\"\n",
1577+
").agg(lambda x: x.iloc[0])\n",
15511578
"inspected_dataset_license.groupby(\"General Typing\").count()"
15521579
]
15531580
}

model_sampling/dataset_sampling.py

Lines changed: 32 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
"""
66

77
# Standard library
8-
import datetime as dt
98
import os
109
import sys
1110
import traceback
@@ -23,32 +22,25 @@
2322
API_KEYS = query_secrets.API_KEYS
2423
API_KEYS_IND = 0
2524
CWD = os.path.dirname(os.path.abspath(__file__))
26-
MODEL_DATABASE = (
27-
f"{CWD}"
28-
f"/model_dataset.db"
29-
)
25+
MODEL_DATABASE = f"{CWD}" f"/model_dataset.db"
3026
PSE_KEY = query_secrets.PSE_KEY
3127

3228
RIGHTS_MAP = {
3329
"by": "cc_attribute",
3430
"sa": "cc_sharealike",
3531
"nc": "cc_noncommercial",
3632
"nd": "cc_nonderived",
37-
"publicdomain": "cc_publicdomain"
33+
"publicdomain": "cc_publicdomain",
3834
}
3935

4036

4137
def get_rights(license_type):
42-
#TODO: Documentation
43-
return [
44-
RIGHTS_MAP[right]
45-
for right in RIGHTS_MAP
46-
if right in license_type
47-
]
38+
# TODO: Documentation
39+
return [RIGHTS_MAP[right] for right in RIGHTS_MAP if right in license_type]
4840

4941

5042
def get_license_map():
51-
#TODO: Documentation
43+
# TODO: Documentation
5244
cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
5345
license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
5446
license_pattern_map = {
@@ -58,7 +50,7 @@ def get_license_map():
5850
"by-nc-sa": "licenses/by-nc-sa/",
5951
"by-nd": "licenses/by-nd/",
6052
"by-nc-nd": "licenses/by-nc-nd/|licenses/by-nd-nc/",
61-
"publicdomain": "publicdomain/"
53+
"publicdomain": "publicdomain/",
6254
}
6355
license_list = pd.Series(
6456
cc_license_data[0]
@@ -72,16 +64,17 @@ def get_license_map():
7264
}
7365
return license_series_map
7466

67+
7568
def get_api_endpoint(license_type, license_rights, start):
76-
#TODO: Documentation
69+
# TODO: Documentation
7770
try:
7871
api_key = API_KEYS[API_KEYS_IND]
7972
base_url = (
8073
r"https://customsearch.googleapis.com/customsearch/v1"
8174
f"?key={api_key}&cx={PSE_KEY}&"
8275
f"q=-fileType%3Apdf%20-inurl%3Apdf%20-pdf&"
8376
f"start={start}&"
84-
f"m12&" #Third Layer Strictness
77+
f"m12&" # Third Layer Strictness
8578
)
8679
base_url = (
8780
f"{base_url}&linkSite=creativecommons.org"
@@ -92,19 +85,17 @@ def get_api_endpoint(license_type, license_rights, start):
9285
except Exception as e:
9386
if isinstance(e, IndexError):
9487
print(
95-
"IndexError: Depleted all API Keys provided",
96-
file=sys.stderr
88+
"IndexError: Depleted all API Keys provided", file=sys.stderr
9789
)
9890
else:
9991
raise e
10092

101-
def get_api_response(license_type, start, retry_on_empty = 2):
102-
#TODO: Documentation
93+
94+
def get_api_response(license_type, start, retry_on_empty=2):
95+
# TODO: Documentation
10396
try:
10497
request_url = get_api_endpoint(
105-
license_type,
106-
get_rights(license_type),
107-
start
98+
license_type, get_rights(license_type), start
10899
)
109100
max_retries = Retry(
110101
total=5,
@@ -121,7 +112,9 @@ def get_api_response(license_type, start, retry_on_empty = 2):
121112
except Exception as e:
122113
if isinstance(e, KeyError):
123114
if retry_on_empty:
124-
return get_api_response(license_type, start, retry_on_empty - 1)
115+
return get_api_response(
116+
license_type, start, retry_on_empty - 1
117+
)
125118
else:
126119
return {}
127120
if isinstance(e, requests.exceptions.HTTPError):
@@ -135,38 +128,37 @@ def get_api_response(license_type, start, retry_on_empty = 2):
135128
print(f"Request URL was {request_url}", file=sys.stderr)
136129
raise e
137130

131+
138132
def get_address_entries(web_url, content_char_count=5000):
139-
#TODO: Documentation
133+
# TODO: Documentation
140134
try:
141135
web_contents = requests.get(web_url).text
142136
encoding = EncodingDetector.find_declared_encoding(
143-
web_contents,
144-
is_html = True
137+
web_contents, is_html=True
145138
)
146139
soup = BeautifulSoup(web_contents, "lxml", from_encoding=encoding)
147140
for script in soup(["script", "style"]):
148141
script.extract()
149-
parse_result = soup.get_text(" ", strip = True)
142+
parse_result = soup.get_text(" ", strip=True)
150143
return (web_url, soup.title, parse_result[:content_char_count])
151-
except Exception as e:
144+
except Exception:
152145
return None
153146

147+
154148
def get_license_type_sample_df(license_type):
155-
#TODO: Documentation
149+
# TODO: Documentation
156150
license_sample_dict = {
157151
"license": [],
158152
"url": [],
159153
"title": [],
160-
"contents": []
154+
"contents": [],
161155
}
162156
for start_ind in range(1, 101, 10):
163157
license_subresponse = get_api_response(license_type, start_ind)
164158
for entry in license_subresponse:
165159
if ".pdf" in entry["link"] or ".txt" in entry["link"]:
166160
continue
167-
address_entries = get_address_entries(
168-
entry["link"]
169-
)
161+
address_entries = get_address_entries(entry["link"])
170162
if address_entries is not None:
171163
license_sample_dict["license"].append(license_type)
172164
license_sample_dict["url"].append(address_entries[0])
@@ -175,23 +167,26 @@ def get_license_type_sample_df(license_type):
175167
print(f"DEBUG: {license_type} has been sampled.")
176168
return pd.DataFrame(license_sample_dict)
177169

170+
178171
def get_license_series_sample_df(general_license_series):
179-
#TODO: Documentation
172+
# TODO: Documentation
180173
return pd.concat(
181174
[
182175
get_license_type_sample_df(license_type)
183176
for license_type in general_license_series
184177
]
185178
)
186179

180+
187181
def load_general_licenses():
188-
#TODO: Documentation
182+
# TODO: Documentation
189183
engine = sqlalchemy.create_engine(f"sqlite:///{CWD}/modeling_dataset.db")
190184
engine.connect()
191185
license_map = get_license_map()
192186
for general_type in license_map:
193187
sampled_df = get_license_series_sample_df(license_map[general_type])
194-
sampled_df.to_sql(general_type, engine, if_exists = 'append')
188+
sampled_df.to_sql(general_type, engine, if_exists="append")
189+
195190

196191
def main():
197192
load_general_licenses()

0 commit comments

Comments (0)