Python 爬取去哪儿网商品数据


爬取去哪儿网自由行商品数据,并保存到 MongoDB 数据库中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import requests
import urllib.request
import time
import pymongo

# Set up the MongoDB client, then select the 'qunar' database and its
# 'qunar_zyx' collection. The collection handle is what get_list() inserts
# crawled result pages into.
client = pymongo.MongoClient('192.168.1.123', 27000)
book_qunar = client['qunar']
sheet_qunar_zyx = book_qunar['qunar_zyx']

# Fetch the product list for one (departure, destination) pair.

# Seconds to wait for the server before giving up on a single request;
# without a timeout, requests.get() can block indefinitely.
_LIST_REQUEST_TIMEOUT = 30


def _list_url(dep, item, offset):
    """Return the product-list URL for the 20-item page starting at *offset*."""
    return (
        'https://touch.dujia.qunar.com/list?modules=list,bookingInfo,activityDetail'
        '&dep={}&query={}&dappDealTrace=true'
        '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
        '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true'
        '&originalquery={}&width=480&height=320&quality=90'
        '&limit={},20&includeAD=true&qsact=search'
    ).format(
        urllib.request.quote(dep),
        urllib.request.quote(item),
        urllib.request.quote(item),
        offset,
    )


def get_list(dep, item):
    """Crawl every product-list page for one route and store it in MongoDB.

    A first request reads the total product count
    (``data.limit.routeCount``); the list is then fetched 20 products at a
    time and each page's JSON is inserted into ``sheet_qunar_zyx`` as one
    document with the keys ``date``, ``dep``, ``arrive``, ``limit`` and
    ``result``.

    Args:
        dep: Departure city name (URL-quoted before use).
        item: Destination query string (URL-quoted before use).

    Returns:
        None. Returns early, storing nothing, when the first response has
        no usable product count.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If a page response body is not valid JSON.
    """
    time.sleep(1)  # throttle: pause one second before every request
    response = requests.get(_list_url(dep, item, 0),
                            timeout=_LIST_REQUEST_TIMEOUT)
    # Read the total number of products for this route.
    try:
        route_count = int(response.json()['data']['limit']['routeCount'])
    except (ValueError, KeyError, TypeError):
        # Body is not JSON, or the count is missing/malformed: nothing to
        # crawl. Only these errors are caught, so KeyboardInterrupt and
        # genuine bugs are no longer swallowed.
        return

    for limit in range(0, route_count, 20):
        time.sleep(1)
        response = requests.get(_list_url(dep, item, limit),
                                timeout=_LIST_REQUEST_TIMEOUT)
        result = {
            # strftime() formats the current local time when no time tuple
            # is passed.
            'date': time.strftime('%Y-%m-%d'),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': response.json()
        }
        sheet_qunar_zyx.insert_one(result)


def get_json(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the decoded JSON body.

    Sleeps one second after the request to throttle the crawl.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely.

    Returns:
        The value produced by decoding the response body as JSON.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    time.sleep(1)
    return response.json()


if __name__ == "__main__":
    # Step 1: download the departure cities.
    departure_table = get_json('https://touch.dujia.qunar.com/depCities.qunar')
    city_groups = departure_table['data']
    for group_key in city_groups:
        for dep in city_groups[group_key]:
            # Step 2: ask for the destinations recommended for this
            # departure city.
            recommend_url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(
                urllib.request.quote(dep))
            recommendations = get_json(recommend_url)

            # Collect each destination query once, in first-seen order.
            destinations = []
            for module in recommendations['data']:
                for sub_module in module['subModules']:
                    for entry in sub_module['items']:
                        destination = entry['query']
                        if destination in destinations:
                            continue
                        destinations.append(destination)

            # Step 3: crawl the product list for every destination.
            for destination in destinations:
                get_list(dep, destination)

数据库监控程序(每隔 10 秒打印一次集合中的文档总数):

1
2
3
4
5
6
from get_depart import sheet_qunar_zyx
import time

# Poll forever, printing the number of documents in the collection every
# 10 seconds. count_documents({}) replaces find().count(): Cursor.count()
# was deprecated in PyMongo 3.7 and removed in PyMongo 4.0.
while True:
    print(sheet_qunar_zyx.count_documents({}))
    time.sleep(10)

参考文档:
《Python 3 爬虫、数据清洗与可视化实战》