update per isort, black, and flake8

TimidRobot · TimidRobot · commit e72f9a64655d · 2023-01-28T08:17:19.000-08:00
diff --git a/model_sampling/dataset_sampling.ipynb b/model_sampling/dataset_sampling.ipynb
@@ -179,11 +179,11 @@
    ],
    "source": [
     "web_contents = requests.get(example_response_df.loc[0, \"link\"]).text\n",
-    "encoding = EncodingDetector.find_declared_encoding(web_contents, is_html = True)\n",
+    "encoding = EncodingDetector.find_declared_encoding(web_contents, is_html=True)\n",
     "soup = BeautifulSoup(web_contents, \"lxml\", from_encoding=encoding)\n",
     "for script in soup([\"script\", \"style\"]):\n",
     "    script.extract()\n",
-    "parse_result = soup.get_text(\"\", strip = True)\n",
+    "parse_result = soup.get_text(\"\", strip=True)\n",
     "print(f\"title: {example_response_df.loc[0, 'title']}\")\n",
     "print(f\"url: {example_response_df.loc[0, 'link']}\")\n",
     "print(f\"contents\\n: {parse_result[:500]}\")"
@@ -224,15 +224,17 @@
    "source": [
     "for i in example_response_df.index:\n",
     "    web_contents = requests.get(example_response_df.loc[i, \"link\"]).text\n",
-    "    encoding = EncodingDetector.find_declared_encoding(web_contents, is_html = True)\n",
+    "    encoding = EncodingDetector.find_declared_encoding(\n",
+    "        web_contents, is_html=True\n",
+    "    )\n",
     "    soup = BeautifulSoup(web_contents, \"lxml\", from_encoding=encoding)\n",
     "    for script in soup([\"script\", \"style\"]):\n",
     "        script.extract()\n",
-    "    parse_result = soup.get_text(\"\", strip = True)\n",
+    "    parse_result = soup.get_text(\"\", strip=True)\n",
     "    print(f\"entry {i}\")\n",
     "    print(f\"title: {soup.title}\")\n",
-    "    '''print(f\"url: {example_response_df.loc[i, 'link']}\")\n",
-    "    print(f\"contents\\n: {parse_result[:500]}\")'''"
+    "    \"\"\"print(f\"url: {example_response_df.loc[i, 'link']}\")\n",
+    "    print(f\"contents\\n: {parse_result[:500]}\")\"\"\""
    ]
   },
   {
@@ -373,11 +375,13 @@
     "sample_contents = []\n",
     "for address in sample_df[\"link\"]:\n",
     "    web_contents = requests.get(address).text\n",
-    "    encoding = EncodingDetector.find_declared_encoding(web_contents, is_html = True)\n",
+    "    encoding = EncodingDetector.find_declared_encoding(\n",
+    "        web_contents, is_html=True\n",
+    "    )\n",
     "    soup = BeautifulSoup(web_contents, \"lxml\", from_encoding=encoding)\n",
     "    for script in soup([\"script\", \"style\"]):\n",
     "        script.extract()\n",
-    "    parse_result = soup.get_text(\"\", strip = True)\n",
+    "    parse_result = soup.get_text(\"\", strip=True)\n",
     "    sample_titles.append(soup.title)\n",
     "    sample_contents.append(parse_result[:3000])\n",
     "sample_df[\"title\"] = sample_titles\n",
@@ -411,7 +415,7 @@
     }
    ],
    "source": [
-    "sample_df.to_sql(\"example\", engine, if_exists = 'append')"
+    "sample_df.to_sql(\"example\", engine, if_exists=\"append\")"
    ]
   },
   {
@@ -939,7 +943,7 @@
     "    \"by-nc-sa\": \"licenses/by-nc-sa/\",\n",
     "    \"by-nd\": \"licenses/by-nd/\",\n",
     "    \"by-nc-nd\": \"licenses/by-nc-nd/|licenses/by-nd-nc/\",\n",
-    "    \"publicdomain\": \"publicdomain/\"\n",
+    "    \"publicdomain\": \"publicdomain/\",\n",
     "}\n",
     "license_list = pd.Series(\n",
     "    cc_license_data[0]\n",
@@ -961,20 +965,27 @@
    "outputs": [],
    "source": [
     "import dataset_sampling\n",
+    "\n",
     "license_map = dataset_sampling.get_license_map()\n",
     "license_ser = pd.concat([v for v in license_map.values()])\n",
     "license_ser_splits_df = license_ser.str.split(\"/\", expand=True)\n",
     "license_ser_splits_df = license_ser_splits_df.rename(\n",
-    "    columns = {\n",
+    "    columns={\n",
     "        0: \"Tool Typing\",\n",
     "        1: \"General Typing\",\n",
     "        2: \"Version\",\n",
-    "        3: \"Jurisdiction\"\n",
+    "        3: \"Jurisdiction\",\n",
     "    }\n",
     ")\n",
-    "license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\"General Typing\"].str.replace(\"mark|zero\", \"publicdomain\", regex=True)\n",
-    "license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\"General Typing\"].str.replace(\"by-nd-nc\", \"by-nc-nd\", regex=True)\n",
-    "license_ser_splits_df[\"Version\"] = license_ser_splits_df[\"Version\"].astype(float)"
+    "license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\n",
+    "    \"General Typing\"\n",
+    "].str.replace(\"mark|zero\", \"publicdomain\", regex=True)\n",
+    "license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\n",
+    "    \"General Typing\"\n",
+    "].str.replace(\"by-nd-nc\", \"by-nc-nd\", regex=True)\n",
+    "license_ser_splits_df[\"Version\"] = license_ser_splits_df[\"Version\"].astype(\n",
+    "    float\n",
+    ")"
    ]
   },
   {
@@ -984,11 +995,21 @@
    "outputs": [],
    "source": [
     "license_one_hot_encoding = pd.DataFrame()\n",
-    "license_one_hot_encoding[\"by\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"by\")\n",
-    "license_one_hot_encoding[\"sa\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"sa\")\n",
-    "license_one_hot_encoding[\"nc\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"nc\")\n",
-    "license_one_hot_encoding[\"nd\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"nd\")\n",
-    "license_not_six_type = license_ser_splits_df[\"General Typing\"].str.contains(\"by|sa|nc|nd\")\n",
+    "license_one_hot_encoding[\"by\"] = license_ser_splits_df[\n",
+    "    \"General Typing\"\n",
+    "].str.contains(\"by\")\n",
+    "license_one_hot_encoding[\"sa\"] = license_ser_splits_df[\n",
+    "    \"General Typing\"\n",
+    "].str.contains(\"sa\")\n",
+    "license_one_hot_encoding[\"nc\"] = license_ser_splits_df[\n",
+    "    \"General Typing\"\n",
+    "].str.contains(\"nc\")\n",
+    "license_one_hot_encoding[\"nd\"] = license_ser_splits_df[\n",
+    "    \"General Typing\"\n",
+    "].str.contains(\"nd\")\n",
+    "license_not_six_type = license_ser_splits_df[\"General Typing\"].str.contains(\n",
+    "    \"by|sa|nc|nd\"\n",
+    ")\n",
     "license_one_hot_encoding[\"neither\"] = ~(license_not_six_type.fillna(False))"
    ]
   },
@@ -998,8 +1019,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "license_df = pd.concat([license_ser, license_ser_splits_df, license_one_hot_encoding], axis = 1)\\\n",
-    "    .rename(columns = {0: \"license\"})"
+    "license_df = pd.concat(\n",
+    "    [license_ser, license_ser_splits_df, license_one_hot_encoding], axis=1\n",
+    ").rename(columns={0: \"license\"})"
    ]
   },
   {
@@ -1019,7 +1041,9 @@
     }
    ],
    "source": [
-    "sampling_engine = sqlalchemy.create_engine(f\"sqlite:///{CWD}/modeling_dataset.db\")\n",
+    "sampling_engine = sqlalchemy.create_engine(\n",
+    "    f\"sqlite:///{CWD}/modeling_dataset.db\"\n",
+    ")\n",
     "sampling_engine.connect()"
    ]
   },
@@ -1334,13 +1358,12 @@
     }
    ],
    "source": [
-    "tables = pd.read_sql(\"SELECT * FROM sqlite_master WHERE type='table'\", sampling_engine)\n",
+    "tables = pd.read_sql(\n",
+    "    \"SELECT * FROM sqlite_master WHERE type='table'\", sampling_engine\n",
+    ")\n",
     "inspected_dataset = pd.concat(\n",
     "    [\n",
-    "        pd.read_sql(\n",
-    "            f\"SELECT * FROM '{license_type}'\",\n",
-    "            sampling_engine\n",
-    "        )\n",
+    "        pd.read_sql(f\"SELECT * FROM '{license_type}'\", sampling_engine)\n",
     "        for license_type in tables[\"name\"]\n",
     "    ]\n",
     ")\n",
@@ -1540,14 +1563,18 @@
     }
    ],
    "source": [
-    "inspected_dataset_license = inspected_dataset.merge(license_df, on = \"license\")\n",
-    "inspected_dataset_license[\"parsed_title\"] = \\\n",
-    "    inspected_dataset_license[\"title\"].str.extract(r\"<title>(.*)</title>\")\n",
-    "inspected_dataset_license[\"content_sum\"] = \\\n",
-    "    inspected_dataset_license[\"url\"] + \\\n",
-    "    inspected_dataset_license[\"parsed_title\"] + \\\n",
-    "    inspected_dataset_license[\"contents\"]\n",
-    "inspected_dataset_license = inspected_dataset_license.groupby(\"content_sum\").agg(lambda x: x.iloc[0])\n",
+    "inspected_dataset_license = inspected_dataset.merge(license_df, on=\"license\")\n",
+    "inspected_dataset_license[\"parsed_title\"] = inspected_dataset_license[\n",
+    "    \"title\"\n",
+    "].str.extract(r\"<title>(.*)</title>\")\n",
+    "inspected_dataset_license[\"content_sum\"] = (\n",
+    "    inspected_dataset_license[\"url\"]\n",
+    "    + inspected_dataset_license[\"parsed_title\"]\n",
+    "    + inspected_dataset_license[\"contents\"]\n",
+    ")\n",
+    "inspected_dataset_license = inspected_dataset_license.groupby(\n",
+    "    \"content_sum\"\n",
+    ").agg(lambda x: x.iloc[0])\n",
     "inspected_dataset_license.groupby(\"General Typing\").count()"
    ]
   }
diff --git a/model_sampling/dataset_sampling.py b/model_sampling/dataset_sampling.py
@@ -5,7 +5,6 @@
 """
 
 # Standard library
-import datetime as dt
 import os
 import sys
 import traceback
@@ -23,32 +22,25 @@
 API_KEYS = query_secrets.API_KEYS
 API_KEYS_IND = 0
 CWD = os.path.dirname(os.path.abspath(__file__))
-MODEL_DATABASE = (
-    f"{CWD}"
-    f"/model_dataset.db"
-)
+MODEL_DATABASE = f"{CWD}" f"/model_dataset.db"
 PSE_KEY = query_secrets.PSE_KEY
 
 RIGHTS_MAP = {
     "by": "cc_attribute",
     "sa": "cc_sharealike",
     "nc": "cc_noncommercial",
     "nd": "cc_nonderived",
-    "publicdomain": "cc_publicdomain"
+    "publicdomain": "cc_publicdomain",
 }
 
 
 def get_rights(license_type):
-    #TODO: Documentation
-    return [
-        RIGHTS_MAP[right]
-        for right in RIGHTS_MAP
-        if right in license_type
-    ]
+    # TODO: Documentation
+    return [RIGHTS_MAP[right] for right in RIGHTS_MAP if right in license_type]
 
 
 def get_license_map():
-    #TODO: Documentation
+    # TODO: Documentation
     cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
     license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
     license_pattern_map = {
@@ -58,7 +50,7 @@ def get_license_map():
         "by-nc-sa": "licenses/by-nc-sa/",
         "by-nd": "licenses/by-nd/",
         "by-nc-nd": "licenses/by-nc-nd/|licenses/by-nd-nc/",
-        "publicdomain": "publicdomain/"
+        "publicdomain": "publicdomain/",
     }
     license_list = pd.Series(
         cc_license_data[0]
@@ -72,16 +64,17 @@ def get_license_map():
     }
     return license_series_map
 
+
 def get_api_endpoint(license_type, license_rights, start):
-    #TODO: Documentation
+    # TODO: Documentation
     try:
         api_key = API_KEYS[API_KEYS_IND]
         base_url = (
             r"https://customsearch.googleapis.com/customsearch/v1"
             f"?key={api_key}&cx={PSE_KEY}&"
             f"q=-fileType%3Apdf%20-inurl%3Apdf%20-pdf&"
             f"start={start}&"
-            f"m12&" #Third Layer Strictness
+            f"m12&"  # Third Layer Strictness
         )
         base_url = (
             f"{base_url}&linkSite=creativecommons.org"
@@ -92,19 +85,17 @@ def get_api_endpoint(license_type, license_rights, start):
     except Exception as e:
         if isinstance(e, IndexError):
             print(
-                "IndexError: Depleted all API Keys provided",
-                file=sys.stderr
+                "IndexError: Depleted all API Keys provided", file=sys.stderr
             )
         else:
             raise e
 
-def get_api_response(license_type, start, retry_on_empty = 2):
-    #TODO: Documentation
+
+def get_api_response(license_type, start, retry_on_empty=2):
+    # TODO: Documentation
     try:
         request_url = get_api_endpoint(
-            license_type,
-            get_rights(license_type),
-            start
+            license_type, get_rights(license_type), start
         )
         max_retries = Retry(
             total=5,
@@ -121,7 +112,9 @@ def get_api_response(license_type, start, retry_on_empty = 2):
     except Exception as e:
         if isinstance(e, KeyError):
             if retry_on_empty:
-                return get_api_response(license_type, start, retry_on_empty - 1)
+                return get_api_response(
+                    license_type, start, retry_on_empty - 1
+                )
             else:
                 return {}
         if isinstance(e, requests.exceptions.HTTPError):
@@ -135,38 +128,37 @@ def get_api_response(license_type, start, retry_on_empty = 2):
             print(f"Request URL was {request_url}", file=sys.stderr)
             raise e
 
+
 def get_address_entries(web_url, content_char_count=5000):
-    #TODO: Documentation
+    # TODO: Documentation
     try:
         web_contents = requests.get(web_url).text
         encoding = EncodingDetector.find_declared_encoding(
-            web_contents,
-            is_html = True
+            web_contents, is_html=True
         )
         soup = BeautifulSoup(web_contents, "lxml", from_encoding=encoding)
         for script in soup(["script", "style"]):
             script.extract()
-        parse_result = soup.get_text(" ", strip = True)
+        parse_result = soup.get_text(" ", strip=True)
         return (web_url, soup.title, parse_result[:content_char_count])
-    except Exception as e:
+    except Exception:
         return None
 
+
 def get_license_type_sample_df(license_type):
-    #TODO: Documentation
+    # TODO: Documentation
     license_sample_dict = {
         "license": [],
         "url": [],
         "title": [],
-        "contents": []
+        "contents": [],
     }
     for start_ind in range(1, 101, 10):
         license_subresponse = get_api_response(license_type, start_ind)
         for entry in license_subresponse:
             if ".pdf" in entry["link"] or ".txt" in entry["link"]:
                 continue
-            address_entries = get_address_entries(
-                entry["link"]
-            )
+            address_entries = get_address_entries(entry["link"])
             if address_entries is not None:
                 license_sample_dict["license"].append(license_type)
                 license_sample_dict["url"].append(address_entries[0])
@@ -175,23 +167,26 @@ def get_license_type_sample_df(license_type):
     print(f"DEBUG: {license_type} has been sampled.")
     return pd.DataFrame(license_sample_dict)
 
+
 def get_license_series_sample_df(general_license_series):
-    #TODO: Documentation
+    # TODO: Documentation
     return pd.concat(
         [
             get_license_type_sample_df(license_type)
             for license_type in general_license_series
         ]
     )
 
+
 def load_general_licenses():
-    #TODO: Documentation
+    # TODO: Documentation
     engine = sqlalchemy.create_engine(f"sqlite:///{CWD}/modeling_dataset.db")
     engine.connect()
     license_map = get_license_map()
     for general_type in license_map:
         sampled_df = get_license_series_sample_df(license_map[general_type])
-        sampled_df.to_sql(general_type, engine, if_exists = 'append')
+        sampled_df.to_sql(general_type, engine, if_exists="append")
+
 
 def main():
     load_general_licenses()
diff --git a/model_sampling/model_dev.ipynb b/model_sampling/model_dev.ipynb