1- from __future__ import print_function
2-
3- from time import sleep
41import sys
5-
6- from BeautifulSoup import BeautifulSoup # Python 2 only, sorry.
2+ from urllib .parse import urlparse
73
84import requests
9- from streamz import Stream
105import toolz
11- import urlparse
6+ from bs4 import BeautifulSoup
127
8+ from streamz import Stream , identity
139
1410
15- def links_of_page ((content , page )):
16- uri = urlparse .urlparse (page )
11+ def links_of_page (content_page ):
12+ (content , page ) = content_page
13+ uri = urlparse (page )
1714 domain = '%s://%s' % (uri .scheme , uri .netloc )
1815 try :
19- soup = BeautifulSoup (content )
16+ soup = BeautifulSoup (content , features = "html.parser" )
2017 except :
2118 return []
2219 else :
23- links = [link .get ('href' ) for link in soup .findAll ('a' )]
20+ links = [link .get ('href' ) for link in soup .find_all ('a' )]
2421 return [domain + link
2522 for link in links
2623 if link
@@ -41,8 +38,9 @@ def topk_dict(d, k=10):
4138 .map (lambda x : x .content ))
4239links = (content .zip (pages )
4340 .map (links_of_page )
44- .concat ())
45- links .sink (source .emit )
41+ .filter (identity )
42+ .flatten ())
43+ links .connect (source )
4644
4745"""
4846from nltk.corpus import stopwords
@@ -60,8 +58,7 @@ def topk_dict(d, k=10):
6058"""
6159
# Entry point: the first command-line argument is the seed URL.
# Emitting it into `source` synchronously drives the whole cyclic
# pipeline (the crawl runs until it exhausts reachable links), so a
# Ctrl-C during the crawl is caught here to exit quietly instead of
# dumping a traceback.
if len(sys.argv) > 1:
    try:
        source.emit(sys.argv[1])
    except KeyboardInterrupt:
        pass
# (scrape artifact removed: trailing "0 commit comments" from the commit page)