def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html
Now, when a download error occurs, the function catches the exception and returns None.
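A quick way to check this behaviour is to call download with a hostname that will not resolve. The address below is made up purely for illustration, and the call assumes the download function above has been defined and urllib2 imported; any unreachable URL would show the same result.

# requesting an unresolvable host raises urllib2.URLError, which download()
# catches before returning None
result = download('http://this-host-does-not-exist.example/')
print result    # prints: None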
Retrying downloads
Errors encountered while downloading are often temporary, such as the 503 Service Unavailable error returned when a server is overloaded. For this kind of error it is worth retrying the download, because the server problem may well have been resolved by now. However, we do not want to retry on every error. If the server returns 404 Not Found, the page does not currently exist, and repeating the same request is unlikely to produce a different result. So we only need the download function to retry when a 5xx error occurs. Below is the new version of the code with support for retrying downloads.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2
def download(url, num_retries=2):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries-1)
    return html
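To exercise the retry logic, the function can be pointed at a URL that reliably answers with a 5xx status. The http://httpstat.us/500 address used below is an assumption about an external test service; any URL known to return a 500 error would work the same way.

# one original attempt plus num_retries=2 retries: three downloads in total,
# each printing a download error, before None is finally returned
download('http://httpstat.us/500')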
import re
import urllib2

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)

if __name__ == '__main__':
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')
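As a sanity check on the <loc> regular expression used by crawl_sitemap, the sketch below runs the same pattern over a hand-written sitemap fragment; the URLs in it are invented for illustration.

import re

sample_sitemap = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>http://example.webscraping.com/view/1</loc></url>
  <url><loc>http://example.webscraping.com/view/2</loc></url>
</urlset>"""

# the same pattern crawl_sitemap() uses to pull out each page URL
print re.findall('<loc>(.*?)</loc>', sample_sitemap)
# ['http://example.webscraping.com/view/1', 'http://example.webscraping.com/view/2']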
import itertools
import urllib2

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

for page in itertools.count(1):
    url = 'http://example.webscraping.com/view/-%d' % page
    html = download(url)
    if html is None:
        break
    else:
        # success - can scrape the result
        pass
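For readers unfamiliar with itertools.count, it yields an unbounded sequence of integers, which is why the loop above has to decide for itself when to stop; a minimal illustration:

import itertools

counter = itertools.count(1)
print next(counter), next(counter), next(counter)    # prints: 1 2 3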
import itertools
import urllib2

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

# maximum number of consecutive download errors allowed
max_errors = 5
# current number of consecutive download errors
num_errors = 0

for page in itertools.count(1):
    url = 'http://example.webscraping.com/view/-%d' % page
    html = download(url)
    if html is None:
        # received an error trying to download this webpage
        num_errors += 1
        if num_errors == max_errors:
            # reached maximum number of
            # consecutive errors so exit
            break
    else:
        # success - can scrape the result
        # ...
        num_errors = 0
import re
import urllib2

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)

def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
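To show why a further revision is needed, the sketch below runs get_links over a hand-written HTML fragment (the hrefs are invented). The links it returns are relative paths, which urllib2 cannot download on their own, so the crawler above would fail as soon as it tried to follow them.

# hand-written HTML fragment with two relative links, for illustration only
sample_html = ('<a href="/index/1">Next</a> '
               '<a href="/view/1">View</a>')
print get_links(sample_html)
# ['/index/1', '/view/1']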
import re
import urllib2
import urlparse

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                crawl_queue.append(link)

def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
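The behaviour of urlparse.urljoin is worth a quick look, since it turns the relative links extracted above into absolute URLs and leaves already-absolute links untouched; a minimal sketch with illustrative paths:

import urlparse

print urlparse.urljoin('http://example.webscraping.com/index/1', '/view/2')
# http://example.webscraping.com/view/2
print urlparse.urljoin('http://example.webscraping.com/index/1',
                       'http://example.webscraping.com/view/3')
# http://example.webscraping.com/view/3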
import re
import urllib2
import urlparse

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # check if we have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
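Finally, a sketch of how the finished crawler might be invoked. The seed URL and link regular expression below are illustrative; the pattern simply restricts the crawl to index and view pages, and would need to be adapted to whatever site is actually being crawled.

# follow only links whose paths start with /index or /view
link_crawler('http://example.webscraping.com', '/(index|view)')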