Reading Notes on "Web Scraping with Python" (《用Python写网络爬虫》)
These are notes taken while reading Web Scraping with Python, to help the material sink in. If anything here infringes copyright, it will be removed immediately.
Introduction to Web Crawlers
Background Research
Identifying the Technologies Used by a Website
The builtwith module can be used to check what kind of technologies a website is built with. Install it as follows:
pip install builtwith

import builtwith
builtwith.parse('http://example.webscraping.com')
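As a small follow-up sketch (assuming builtwith is installed and the site is reachable), builtwith.parse returns a dictionary that maps technology categories to lists of technology names, so the result can be inspected like this:

# a minimal sketch, assuming builtwith is installed and the site is reachable;
# builtwith.parse returns a dict mapping technology categories to lists of names
import builtwith

technologies = builtwith.parse('http://example.webscraping.com')
for category, names in technologies.items():
    print category, '->', ', '.join(names)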
To find out who owns a website, you can use the python-whois module. Install it and query the WHOIS record like this:

pip install python-whois

import whois
print whois.whois('appspot.com')
Downloading a web page. The basic download function below fetches a URL with urllib2 and returns the page's HTML, or None if the download fails:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html
Retrying Downloads
Errors encountered while downloading are often temporary, such as the 503 Service Unavailable error returned when a server is overloaded. For those errors it is worth retrying the download, because the server problem may since have been resolved. We do not want to retry every kind of error, though: if the server returns 404 Not Found, the page does not exist, and repeating the same request is unlikely to give a different result. So the download function only needs to retry when a 5xx error occurs. Here is a new version that supports retrying downloads:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2

def download(url, num_retries=2):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1)
    return html

To test the retry behaviour, try downloading from http://httpstat.us/500, which always responds with a 500 error code:

download('http://httpstat.us/500')
Setting a User Agent
The following code modifies the download function to set a default user agent of 'wswp' (the initials of Web Scraping with Python):
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2

def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html

download('http://www.meetup.com/')
Sitemap crawler. The crawler below downloads the site's sitemap.xml, extracts the page URLs from the <loc> tags with a regular expression, and then downloads each of those pages:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2
import re

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)

if __name__ == '__main__':
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')
ID Iteration Crawler
Consider these example URLs:
- http://example.webscraping.com/view/Afghanistan-1
- http://example.webscraping.com/view/Aland-Islands-2
- http://example.webscraping.com/view/Albania-3
These URLs differ only in their final component: the country name (used as a page alias) and the ID. Including a page alias in the URL is a very common practice that helps with search engine optimization. The web server usually ignores this string and uses only the ID to look up the matching record in the database. If we remove the alias and load http://example.webscraping.com/view/1, the link still works. We can therefore ignore the alias and simply iterate over the IDs to download every country page.
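A quick way to check this claim is a small sketch that reuses the download function defined earlier and fetches both forms of the URL:

# sketch: both the aliased URL and the alias-free URL should return a page,
# because the server matches the record by ID and ignores the alias
# (reuses the download() function defined earlier)
aliased = download('http://example.webscraping.com/view/Afghanistan-1')
alias_free = download('http://example.webscraping.com/view/1')
print aliased is not None and alias_free is not None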
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2
import itertools

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

for page in itertools.count(1):
    url = 'http://example.webscraping.com/view/-%d' % page
    html = download(url)
    if html is None:
        break
    else:
        # success - can scrape the result
        pass
One weakness of this implementation is that any gap in the ID sequence, such as a deleted record, stops the crawl at the first failed download. The version below is more robust: it only gives up after several consecutive download errors.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2
import itertools

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

# maximum number of consecutive download errors allowed
max_errors = 5
# current number of consecutive download errors
num_errors = 0
for page in itertools.count(1):
    url = 'http://example.webscraping.com/view/-%d' % page
    html = download(url)
    if html is None:
        # received an error trying to download this webpage
        num_errors += 1
        if num_errors == max_errors:
            # reached maximum number of
            # consecutive errors so exit
            break
    else:
        # success - can scrape the result
        # ...
        num_errors = 0
Link Crawler
The country page links follow this format:
- http://example.webscraping.com/view/Afghanistan-1
- http://example.webscraping.com/view/Aland-Islands-2
We can therefore match both kinds of page, the index pages and the country pages, with the simple regular expression /(index|view). The code below uses it in a link crawler.
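Before looking at the crawler, here is a quick sanity check on the pattern (a sketch using only the standard re module; the non-matching link is a hypothetical example). re.match anchors the pattern at the start of the string, so it accepts relative index and view links and rejects everything else:

# sketch: re.match only succeeds when the pattern matches at the start of the string
import re

link_regex = '/(index|view)'
print re.match(link_regex, '/index/1') is not None             # True
print re.match(link_regex, '/view/Afghanistan-1') is not None  # True
print re.match(link_regex, '/user/login') is not None          # False (hypothetical link)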
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import urllib2
import urlparse

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)

def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

link_crawler('http://example.webscraping.com', '/(index|view)')
Running this script produces the following output:

ssh://root@192.168.1.122:22/usr/bin/python -u /root/pyFile/test.py
Downloading: http://example.webscraping.com
Downloading: /index/1
Traceback (most recent call last):
...
ValueError: unknown url type: /index/1

The problem is that the extracted links are relative links, which urllib2 cannot download on their own. They need to be converted into absolute URLs with urlparse.urljoin, as in the version below:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import urllib2
import urlparse

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                crawl_queue.append(link)

def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

link_crawler('http://example.webscraping.com', '/(index|view)')
This works, but the pages also link back to one another, so the crawler can end up downloading the same pages over and over. To avoid recrawling duplicates, we keep track of which URLs have already been seen and only queue links we have not visited before:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import urllib2
import urlparse

def download(url):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

link_crawler('http://example.webscraping.com', '/(index|view)')
Parsing robots.txt. The robotparser module can check whether a given user agent is allowed to fetch a URL, so the crawler can respect the site's robots.txt:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
url = 'http://example.webscraping.com'
user_agent = 'BadCrawler'
print rp.can_fetch(user_agent, url)
user_agent = 'GoodCrawler'
print rp.can_fetch(user_agent, url)
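As a sketch only (this wiring is my own, not code from the book), the same check could be added to the link-crawler loop shown earlier so that disallowed URLs are skipped; it assumes the download() and get_links() functions defined above and uses 'wswp' as the crawler's user agent:

# sketch: a link_crawler-style loop that skips URLs disallowed by robots.txt
# assumes download() and get_links() from the earlier examples are available
import re
import robotparser
import urlparse

def link_crawler_with_robots(seed_url, link_regex, user_agent='wswp'):
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url):
            # robots.txt disallows this URL for our user agent - skip it
            print 'Blocked by robots.txt:', url
            continue
        html = download(url)
        if html is None:
            # download failed - nothing to parse
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

If the site's robots.txt disallows the 'BadCrawler' agent, as the example above suggests, a crawler using that agent name would skip every URL while the others would crawl normally.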
Supporting proxies. Sometimes we need to access a website through a proxy. The download function below adds optional proxy support using urllib2.ProxyHandler (the original snippet's "opener.urlopen" and "500 <- e.code" have been corrected to opener.open and 500 <= e.code):

import urllib2
import urlparse

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html
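A call through a proxy might then look like the following; the proxy address is a made-up placeholder rather than a real server:

# sketch: route the download through an HTTP proxy
# 'http://127.0.0.1:8118' is a hypothetical proxy address used for illustration
html = download('http://example.webscraping.com', proxy='http://127.0.0.1:8118')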