1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
|
import cookielib
import re
import urllib2

from mysql_operation import MySQL
class Book:
    """Plain record holding the detailed information of one scraped book."""

    def __init__(self, book_name, book_author, book_publisher,
                 book_publish_date, book_ISBN, book_introduction,
                 book_content, book_image_url):
        # One attribute per scraped field; no processing is done here.
        self.book_name = book_name
        self.book_author = book_author
        self.book_publisher = book_publisher
        self.book_publish_date = book_publish_date
        self.book_ISBN = book_ISBN
        self.book_introduction = book_introduction
        self.book_content = book_content
        self.book_image_url = book_image_url

    def displayBook(self):
        # Dump every field to stdout.  Joining with single spaces
        # reproduces the separator the original comma-separated
        # ``print`` statement emitted between items.
        fields = ["书名: ", self.book_name,
                  "\n作者: ", self.book_author,
                  "\n出版社:", self.book_publisher,
                  "\n出版日期:", self.book_publish_date,
                  "\nISBN:", self.book_ISBN,
                  "\n简介:", self.book_introduction,
                  "\n目录:", self.book_content,
                  "\n缩略图地址:", self.book_image_url]
        print(" ".join(fields))
def get_book_detail(url):
    """Fetch the book-detail page at *url* and scrape it into a Book.

    @url  address of the detail page
    Returns a Book on success; returns None when the download fails.
    Fields whose markup is missing from the page come back as None
    (or '' for the catalog), instead of crashing on ``.group(1)``.
    """
    # Defined up front so the final ``return`` can never hit a NameError
    # when URLError fires before the Book is built (bug in the original).
    book = None
    try:
        html = urllib2.urlopen(url).read()

        def _first_group(pattern, default=None):
            # group(1) of the first match, or *default* when the pattern
            # is absent -- re.search returns None in that case.
            match = re.search(pattern, html, re.S)
            return match.group(1) if match else default

        book_name = _first_group(r'<div style=\"font-size: 20px;font-weight:bold;margin-bottom:13px;\">(.*?)</div>')
        book_author = _first_group(r'<a href=\"/book/listByAuthor\?keyword=.*?\">(.*?)</a>')
        book_publisher_date = _first_group(r'str = \"(.*?)\"')
        if book_publisher_date:
            # Page embeds "publisher,date"; guard against a missing date
            # so the [1] index can no longer raise IndexError.
            pieces = re.split(r'[,\s]\s*', book_publisher_date)
            book_publisher = pieces[0]
            book_publish_date = pieces[1] if len(pieces) > 1 else ''
        else:
            book_publisher = ''
            book_publish_date = ''
        book_ISBN = _first_group(r'url: \"/book/getErrata\.json\?mixedIsbn=(.*?)\"')
        book_introduction = _first_group(r'<p >\ \;\ \;(.*?)</p>')
        book_content = _first_group(r'<p id=\"book_catalog\">(.*?)</p>', '')
        book_image_url = _first_group(r'<img src=\"(http\://cover.yuntu.io/.*?/' + str(book_ISBN) + r'\.jpg)\" >')
        book = Book(book_name, book_author, book_publisher, book_publish_date,
                    book_ISBN, book_introduction, book_content, book_image_url)
    except urllib2.URLError as e:
        print('Download error: %s' % e.reason)
    return book
def get_file(url):
    """Download *url* and return the raw response body, or None on error.

    @url  address of the resource (used here for cover images)
    """
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        # Deliberate side effect kept from the original: install the
        # cookie-aware opener globally so later urllib2.urlopen() calls
        # in this process share the same cookie jar.
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        return operate.read()
    except Exception as e:
        # Narrowed from BaseException so KeyboardInterrupt/SystemExit
        # still propagate; this is a best-effort fetch that signals
        # failure by returning None.
        print(e)
        return None
''' 保存文件到本地
@path 本地路径 @file_name 文件名 @data 文件内容 '''
def save_file(path, file_name, data): if data == None: return
file = open(path + file_name, "wb") file.write(data) file.flush() file.close()
def download(url):
    """Scrape one book page, insert the row into MySQL, save its cover.

    Relies on two module-level globals created in __main__:
    ``seen`` (de-duplication set) and ``conn`` (MySQL connection).
    Already-seen URLs are skipped silently.
    """
    print('Downloading: %s' % url)
    try:
        if url in seen:
            return
        seen.add(url)
        book = get_book_detail(url)
        # Guard both failure modes before touching the object: no Book
        # at all, or a page whose name field could not be scraped.
        if book is None or not book.book_name:
            return
        book.displayBook()
        # Parameterized INSERT: values are bound by the driver, never
        # string-formatted into the SQL (no injection from page content).
        sql = """INSERT INTO Book(BookName, BookImage, BookWriter, BookPublisher, BookPublicationDate, ISBN, BookAbstract, BookContent) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""
        param = (book.book_name,
                 "BookCover/" + book.book_ISBN + ".jpg",
                 book.book_author,
                 book.book_publisher,
                 book.book_publish_date,
                 book.book_ISBN,
                 book.book_introduction,
                 book.book_content)
        conn.insert(sql, param)
        # Fetch the cover image and store it next to the DB row.
        book_image_name = book.book_ISBN + ".jpg"
        save_file("/root/BookCover/", book_image_name, get_file(book.book_image_url))
    except urllib2.URLError as e:
        print('Download error: %s' % e.reason)
if __name__ == '__main__':
    # NOTE(review): credentials/host are placeholders -- configure before use.
    # ``conn`` and ``seen`` are module globals that download() relies on.
    conn = MySQL('127.0.0.1', 'username', 'password', 3306)
    conn.selectDb('YourDatabaseName')
    crawl_queue = []
    seen = set(crawl_queue)
    # ``with`` closes url.txt (the original leaked the handle and shadowed
    # the ``file`` builtin); enumerate replaces the manual counter and the
    # pointless readlines()[0:] slice.
    with open("url.txt") as url_file:
        for count, line in enumerate(url_file):
            print("***************正在打印第%d行****************" % count)
            # Strip the trailing newline so a clean URL reaches urlopen.
            url = line.strip()
            if url:
                download(url)
    conn.commit()
    conn.close()
|