Skip to content

Commit 860ec25

Browse files
committed
Refresh URL scraper example
Using BeautifulSoup4, and making sure to filter out empty lists before feeding them to flatten, this runs again.
1 parent f1b6660 commit 860ec25

File tree

1 file changed

+15
-18
lines changed

1 file changed

+15
-18
lines changed

examples/scrape.py

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,23 @@
1-
from __future__ import print_function
2-
3-
from time import sleep
41
import sys
5-
6-
from BeautifulSoup import BeautifulSoup # Python 2 only, sorry.
2+
from urllib.parse import urlparse
73

84
import requests
9-
from streamz import Stream
105
import toolz
11-
import urlparse
6+
from bs4 import BeautifulSoup
127

8+
from streamz import Stream, identity
139

1410

15-
def links_of_page((content, page)):
16-
uri = urlparse.urlparse(page)
11+
def links_of_page(content_page):
12+
(content, page) = content_page
13+
uri = urlparse(page)
1714
domain = '%s://%s' % (uri.scheme, uri.netloc)
1815
try:
19-
soup = BeautifulSoup(content)
16+
soup = BeautifulSoup(content, features="html.parser")
2017
except:
2118
return []
2219
else:
23-
links = [link.get('href') for link in soup.findAll('a')]
20+
links = [link.get('href') for link in soup.find_all('a')]
2421
return [domain + link
2522
for link in links
2623
if link
@@ -41,8 +38,9 @@ def topk_dict(d, k=10):
4138
.map(lambda x: x.content))
4239
links = (content.zip(pages)
4340
.map(links_of_page)
44-
.concat())
45-
links.sink(source.emit)
41+
.filter(identity)
42+
.flatten())
43+
links.connect(source)
4644

4745
"""
4846
from nltk.corpus import stopwords
@@ -60,8 +58,7 @@ def topk_dict(d, k=10):
6058
"""
6159

6260
if len(sys.argv) > 1:
63-
source.emit(sys.argv[1])
64-
65-
66-
67-
#
61+
try:
62+
source.emit(sys.argv[1])
63+
except KeyboardInterrupt:
64+
pass

0 commit comments

Comments (0)