|
179 | 179 | ], |
180 | 180 | "source": [ |
181 | 181 | "web_contents = requests.get(example_response_df.loc[0, \"link\"]).text\n", |
182 | | - "encoding = EncodingDetector.find_declared_encoding(web_contents, is_html = True)\n", |
| 182 | + "encoding = EncodingDetector.find_declared_encoding(web_contents, is_html=True)\n", |
183 | 183 | "soup = BeautifulSoup(web_contents, \"lxml\", from_encoding=encoding)\n", |
184 | 184 | "for script in soup([\"script\", \"style\"]):\n", |
185 | 185 | " script.extract()\n", |
186 | | - "parse_result = soup.get_text(\"\", strip = True)\n", |
| 186 | + "parse_result = soup.get_text(\"\", strip=True)\n", |
187 | 187 | "print(f\"title: {example_response_df.loc[0, 'title']}\")\n", |
188 | 188 | "print(f\"url: {example_response_df.loc[0, 'link']}\")\n", |
189 | 189 | "print(f\"contents\\n: {parse_result[:500]}\")" |
|
224 | 224 | "source": [ |
225 | 225 | "for i in example_response_df.index:\n", |
226 | 226 | " web_contents = requests.get(example_response_df.loc[i, \"link\"]).text\n", |
227 | | - " encoding = EncodingDetector.find_declared_encoding(web_contents, is_html = True)\n", |
| 227 | + " encoding = EncodingDetector.find_declared_encoding(\n", |
| 228 | + " web_contents, is_html=True\n", |
| 229 | + " )\n", |
228 | 230 | " soup = BeautifulSoup(web_contents, \"lxml\", from_encoding=encoding)\n", |
229 | 231 | " for script in soup([\"script\", \"style\"]):\n", |
230 | 232 | " script.extract()\n", |
231 | | - " parse_result = soup.get_text(\"\", strip = True)\n", |
| 233 | + " parse_result = soup.get_text(\"\", strip=True)\n", |
232 | 234 | " print(f\"entry {i}\")\n", |
233 | 235 | " print(f\"title: {soup.title}\")\n", |
234 | | - " '''print(f\"url: {example_response_df.loc[i, 'link']}\")\n", |
235 | | - " print(f\"contents\\n: {parse_result[:500]}\")'''" |
| 236 | + " \"\"\"print(f\"url: {example_response_df.loc[i, 'link']}\")\n", |
| 237 | + " print(f\"contents\\n: {parse_result[:500]}\")\"\"\"" |
236 | 238 | ] |
237 | 239 | }, |
238 | 240 | { |
|
373 | 375 | "sample_contents = []\n", |
374 | 376 | "for address in sample_df[\"link\"]:\n", |
375 | 377 | " web_contents = requests.get(address).text\n", |
376 | | - " encoding = EncodingDetector.find_declared_encoding(web_contents, is_html = True)\n", |
| 378 | + " encoding = EncodingDetector.find_declared_encoding(\n", |
| 379 | + " web_contents, is_html=True\n", |
| 380 | + " )\n", |
377 | 381 | " soup = BeautifulSoup(web_contents, \"lxml\", from_encoding=encoding)\n", |
378 | 382 | " for script in soup([\"script\", \"style\"]):\n", |
379 | 383 | " script.extract()\n", |
380 | | - " parse_result = soup.get_text(\"\", strip = True)\n", |
| 384 | + " parse_result = soup.get_text(\"\", strip=True)\n", |
381 | 385 | " sample_titles.append(soup.title)\n", |
382 | 386 | " sample_contents.append(parse_result[:3000])\n", |
383 | 387 | "sample_df[\"title\"] = sample_titles\n", |
|
411 | 415 | } |
412 | 416 | ], |
413 | 417 | "source": [ |
414 | | - "sample_df.to_sql(\"example\", engine, if_exists = 'append')" |
| 418 | + "sample_df.to_sql(\"example\", engine, if_exists=\"append\")" |
415 | 419 | ] |
416 | 420 | }, |
417 | 421 | { |
|
939 | 943 | " \"by-nc-sa\": \"licenses/by-nc-sa/\",\n", |
940 | 944 | " \"by-nd\": \"licenses/by-nd/\",\n", |
941 | 945 | " \"by-nc-nd\": \"licenses/by-nc-nd/|licenses/by-nd-nc/\",\n", |
942 | | - " \"publicdomain\": \"publicdomain/\"\n", |
| 946 | + " \"publicdomain\": \"publicdomain/\",\n", |
943 | 947 | "}\n", |
944 | 948 | "license_list = pd.Series(\n", |
945 | 949 | " cc_license_data[0]\n", |
|
961 | 965 | "outputs": [], |
962 | 966 | "source": [ |
963 | 967 | "import dataset_sampling\n", |
| 968 | + "\n", |
964 | 969 | "license_map = dataset_sampling.get_license_map()\n", |
965 | 970 | "license_ser = pd.concat([v for v in license_map.values()])\n", |
966 | 971 | "license_ser_splits_df = license_ser.str.split(\"/\", expand=True)\n", |
967 | 972 | "license_ser_splits_df = license_ser_splits_df.rename(\n", |
968 | | - " columns = {\n", |
| 973 | + " columns={\n", |
969 | 974 | " 0: \"Tool Typing\",\n", |
970 | 975 | " 1: \"General Typing\",\n", |
971 | 976 | " 2: \"Version\",\n", |
972 | | - " 3: \"Jurisdiction\"\n", |
| 977 | + " 3: \"Jurisdiction\",\n", |
973 | 978 | " }\n", |
974 | 979 | ")\n", |
975 | | - "license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\"General Typing\"].str.replace(\"mark|zero\", \"publicdomain\", regex=True)\n", |
976 | | - "license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\"General Typing\"].str.replace(\"by-nd-nc\", \"by-nc-nd\", regex=True)\n", |
977 | | - "license_ser_splits_df[\"Version\"] = license_ser_splits_df[\"Version\"].astype(float)" |
| 980 | + "license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\n", |
| 981 | + " \"General Typing\"\n", |
| 982 | + "].str.replace(\"mark|zero\", \"publicdomain\", regex=True)\n", |
| 983 | + "license_ser_splits_df[\"General Typing\"] = license_ser_splits_df[\n", |
| 984 | + " \"General Typing\"\n", |
| 985 | + "].str.replace(\"by-nd-nc\", \"by-nc-nd\", regex=True)\n", |
| 986 | + "license_ser_splits_df[\"Version\"] = license_ser_splits_df[\"Version\"].astype(\n", |
| 987 | + " float\n", |
| 988 | + ")" |
978 | 989 | ] |
979 | 990 | }, |
980 | 991 | { |
|
984 | 995 | "outputs": [], |
985 | 996 | "source": [ |
986 | 997 | "license_one_hot_encoding = pd.DataFrame()\n", |
987 | | - "license_one_hot_encoding[\"by\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"by\")\n", |
988 | | - "license_one_hot_encoding[\"sa\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"sa\")\n", |
989 | | - "license_one_hot_encoding[\"nc\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"nc\")\n", |
990 | | - "license_one_hot_encoding[\"nd\"] = license_ser_splits_df[\"General Typing\"].str.contains(\"nd\")\n", |
991 | | - "license_not_six_type = license_ser_splits_df[\"General Typing\"].str.contains(\"by|sa|nc|nd\")\n", |
| 998 | + "license_one_hot_encoding[\"by\"] = license_ser_splits_df[\n", |
| 999 | + " \"General Typing\"\n", |
| 1000 | + "].str.contains(\"by\")\n", |
| 1001 | + "license_one_hot_encoding[\"sa\"] = license_ser_splits_df[\n", |
| 1002 | + " \"General Typing\"\n", |
| 1003 | + "].str.contains(\"sa\")\n", |
| 1004 | + "license_one_hot_encoding[\"nc\"] = license_ser_splits_df[\n", |
| 1005 | + " \"General Typing\"\n", |
| 1006 | + "].str.contains(\"nc\")\n", |
| 1007 | + "license_one_hot_encoding[\"nd\"] = license_ser_splits_df[\n", |
| 1008 | + " \"General Typing\"\n", |
| 1009 | + "].str.contains(\"nd\")\n", |
| 1010 | + "license_not_six_type = license_ser_splits_df[\"General Typing\"].str.contains(\n", |
| 1011 | + " \"by|sa|nc|nd\"\n", |
| 1012 | + ")\n", |
992 | 1013 | "license_one_hot_encoding[\"neither\"] = ~(license_not_six_type.fillna(False))" |
993 | 1014 | ] |
994 | 1015 | }, |
|
998 | 1019 | "metadata": {}, |
999 | 1020 | "outputs": [], |
1000 | 1021 | "source": [ |
1001 | | - "license_df = pd.concat([license_ser, license_ser_splits_df, license_one_hot_encoding], axis = 1)\\\n", |
1002 | | - " .rename(columns = {0: \"license\"})" |
| 1022 | + "license_df = pd.concat(\n", |
| 1023 | + " [license_ser, license_ser_splits_df, license_one_hot_encoding], axis=1\n", |
| 1024 | + ").rename(columns={0: \"license\"})" |
1003 | 1025 | ] |
1004 | 1026 | }, |
1005 | 1027 | { |
|
1019 | 1041 | } |
1020 | 1042 | ], |
1021 | 1043 | "source": [ |
1022 | | - "sampling_engine = sqlalchemy.create_engine(f\"sqlite:///{CWD}/modeling_dataset.db\")\n", |
| 1044 | + "sampling_engine = sqlalchemy.create_engine(\n", |
| 1045 | + " f\"sqlite:///{CWD}/modeling_dataset.db\"\n", |
| 1046 | + ")\n", |
1023 | 1047 | "sampling_engine.connect()" |
1024 | 1048 | ] |
1025 | 1049 | }, |
|
1334 | 1358 | } |
1335 | 1359 | ], |
1336 | 1360 | "source": [ |
1337 | | - "tables = pd.read_sql(\"SELECT * FROM sqlite_master WHERE type='table'\", sampling_engine)\n", |
| 1361 | + "tables = pd.read_sql(\n", |
| 1362 | + " \"SELECT * FROM sqlite_master WHERE type='table'\", sampling_engine\n", |
| 1363 | + ")\n", |
1338 | 1364 | "inspected_dataset = pd.concat(\n", |
1339 | 1365 | " [\n", |
1340 | | - " pd.read_sql(\n", |
1341 | | - " f\"SELECT * FROM '{license_type}'\",\n", |
1342 | | - " sampling_engine\n", |
1343 | | - " )\n", |
| 1366 | + " pd.read_sql(f\"SELECT * FROM '{license_type}'\", sampling_engine)\n", |
1344 | 1367 | " for license_type in tables[\"name\"]\n", |
1345 | 1368 | " ]\n", |
1346 | 1369 | ")\n", |
|
1540 | 1563 | } |
1541 | 1564 | ], |
1542 | 1565 | "source": [ |
1543 | | - "inspected_dataset_license = inspected_dataset.merge(license_df, on = \"license\")\n", |
1544 | | - "inspected_dataset_license[\"parsed_title\"] = \\\n", |
1545 | | - " inspected_dataset_license[\"title\"].str.extract(r\"<title>(.*)</title>\")\n", |
1546 | | - "inspected_dataset_license[\"content_sum\"] = \\\n", |
1547 | | - " inspected_dataset_license[\"url\"] + \\\n", |
1548 | | - " inspected_dataset_license[\"parsed_title\"] + \\\n", |
1549 | | - " inspected_dataset_license[\"contents\"]\n", |
1550 | | - "inspected_dataset_license = inspected_dataset_license.groupby(\"content_sum\").agg(lambda x: x.iloc[0])\n", |
| 1566 | + "inspected_dataset_license = inspected_dataset.merge(license_df, on=\"license\")\n", |
| 1567 | + "inspected_dataset_license[\"parsed_title\"] = inspected_dataset_license[\n", |
| 1568 | + " \"title\"\n", |
| 1569 | + "].str.extract(r\"<title>(.*)</title>\")\n", |
| 1570 | + "inspected_dataset_license[\"content_sum\"] = (\n", |
| 1571 | + " inspected_dataset_license[\"url\"]\n", |
| 1572 | + " + inspected_dataset_license[\"parsed_title\"]\n", |
| 1573 | + " + inspected_dataset_license[\"contents\"]\n", |
| 1574 | + ")\n", |
| 1575 | + "inspected_dataset_license = inspected_dataset_license.groupby(\n", |
| 1576 | + " \"content_sum\"\n", |
| 1577 | + ").agg(lambda x: x.iloc[0])\n", |
1551 | 1578 | "inspected_dataset_license.groupby(\"General Typing\").count()" |
1552 | 1579 | ] |
1553 | 1580 | } |
|
0 commit comments