|
import os
import random
import re
import time
from threading import Lock, Thread

import requests
| 7 | + |
| 8 | + |
class MultithreadCrawler:
    """Three-stage, thread-based image crawler.

    Work flows through three plain lists that are used as FIFO queues:
    ``Links`` (page URLs) -> ``Pages`` (downloaded HTML) -> ``Images``
    (image URLs found in the HTML).  The string ``"#END#"`` is the
    end-of-stream marker in each list; a worker that takes it puts it back
    so that its sibling workers see it too.

    The marker is forwarded to the next list only by the *last* worker of a
    stage to finish, so a downstream worker never observes the marker while
    an upstream worker is still producing items.
    """

    # End-of-stream marker stored in the queue lists.
    _END = "#END#"
    # Seconds to sleep between polls of an empty queue.
    _POLL_INTERVAL = 0.01
    # Seconds after which a stalled HTTP request is abandoned, so a dead
    # connection cannot block a worker (and therefore join()) forever.
    _REQUEST_TIMEOUT = 10
    # Captures the src attribute of the <img> that follows a "查看原图" link.
    _IMAGE_LINK_RE = re.compile(r'查看原图.+?img src=\"(.+?)\"')

    def __init__(self):
        self.Links = []
        self.LinksFailed = []
        self.Pages = []
        self.Images = []
        # Pool of User-Agent strings (not used by the current request code).
        self.UA = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12",
            "Opera/9.27 (Windows NT 5.2; U; zh-cn)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
            "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 ",
            "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ",
            "Mozilla/5.0 (Linux; U; Android 3.2; ja-jp; F-01D Build/F0001) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13 ",
            "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7",
            "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5 ",
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 ",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
        ]
        self.sleeptime = 0.1
        self.threadnums = 12
        self.image_path = "./MultithreadCrawlerImages"
        # exist_ok avoids the exists()/mkdir() race when the directory is
        # created by someone else between the check and the call.
        os.makedirs(self.image_path, exist_ok=True)
        # Guards queue hand-off and the per-stage worker counters below.
        self._lock = Lock()
        # Number of workers currently running in each producing stage.
        self._active = {"fetch": 0, "extract": 0}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _take(self, queue, wait=True):
        """Remove and return the next item of *queue*.

        The emptiness check, the pop and the re-append of the end marker
        happen under one lock, so two workers can never race for the last
        element and no worker ever sees the marker "missing".

        :param queue: one of ``self.Links`` / ``self.Pages`` / ``self.Images``
        :param wait: poll until an item arrives when the queue is empty;
            when False an empty queue is reported as end-of-stream
        :return: the item, or the end marker
        """
        while True:
            with self._lock:
                if queue:
                    item = queue.pop(0)
                    if item == self._END:
                        # Leave the marker in place for sibling workers.
                        queue.append(item)
                    return item
                if not wait:
                    return self._END
            time.sleep(self._POLL_INTERVAL)

    def _enter_stage(self, stage):
        """Register the calling worker as active in *stage*."""
        with self._lock:
            self._active[stage] += 1

    def _leave_stage(self, stage, downstream):
        """Unregister the calling worker from *stage*.

        The last worker to leave appends the end marker to *downstream*;
        at that point every item this stage took has been fully processed.
        """
        with self._lock:
            self._active[stage] -= 1
            if self._active[stage] == 0:
                downstream.append(self._END)

    @staticmethod
    def _headers(host):
        """Return the browser-like request headers for *host*."""
        return {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
            "Host": host,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
        }

    def _image_target(self, url):
        """Return ``(download_url, host, local_file_name)`` for an image URL.

        Extracted links are normally protocol-relative (``//host/path``)
        and get an ``http:`` prefix; a URL that already carries a scheme is
        used unchanged instead of being corrupted by the prefix.
        """
        parts = url.split("/")
        img_name = self.image_path + "/" + parts[-1]
        img_host = parts[2]
        img_url = "http:" + url if url.startswith("//") else url
        return img_url, img_host, img_name

    # ------------------------------------------------------------------
    # Pipeline stages
    # ------------------------------------------------------------------
    def getLinks(self, link, minpagenum, maxpagenum):
        """
        Build the page URLs to crawl and queue them for getPageContent.
        Example page URL: http://jandan.net/pic/page-166
        link: URL prefix, e.g. http://jandan.net/pic/page-
        minpagenum: first page number (inclusive), e.g. 1
        maxpagenum: last page number (inclusive), e.g. 192
        :return: None
        """
        self.Links.extend(
            "{}{}".format(link, index)
            for index in range(minpagenum, maxpagenum + 1)
        )
        self.Links.append(self._END)  # end-of-stream marker

    def getPageContent(self):
        """
        Worker: download every queued link and append the HTML to Pages.
        Links that fail are recorded in LinksFailed and reported on stdout.
        An empty Links list is treated as "nothing to do".
        :return: None
        """
        self._enter_stage("fetch")
        try:
            while True:
                tmp_link = self._take(self.Links, wait=False)
                if tmp_link == self._END:
                    break
                try:
                    # Host is taken from the link itself so that links to
                    # other sites are requested with a matching header.
                    head = self._headers(tmp_link.split("/")[2])
                    result = requests.get(
                        tmp_link, headers=head, timeout=self._REQUEST_TIMEOUT
                    ).text
                    self.Pages.append(result)
                except Exception:
                    # Best effort: one bad page must not stop the worker.
                    self.LinksFailed.append(tmp_link)
                    print("Failed Link:{}".format(tmp_link))
        finally:
            self._leave_stage("fetch", self.Pages)

    def getImageLink(self):
        """
        Worker: take downloaded pages from Pages, extract the image links
        and append them to Images.
        :return: None
        """
        self._enter_stage("extract")
        try:
            while True:
                content = self._take(self.Pages)
                if content == self._END:
                    break
                self.Images.extend(self._IMAGE_LINK_RE.findall(content))
        finally:
            self._leave_stage("extract", self.Images)

    def getImage(self):
        """
        Worker: take image links from Images, download each one and save it
        under image_path using the last path segment as the file name.
        Failures are reported on stdout and skipped.
        :return: None
        """
        while True:
            url = self._take(self.Images)
            if url == self._END:
                break
            try:
                img_url, img_host, img_name = self._image_target(url)
                content = requests.get(
                    url=img_url,
                    headers=self._headers(img_host),
                    timeout=self._REQUEST_TIMEOUT,
                ).content
                with open(img_name, "wb") as fw:
                    fw.write(content)
            except Exception:
                # Best effort: one bad image must not stop the worker.
                print("failed img link: ", url)

    def main(self, link="http://jandan.net/pic/page-", minpagenum=1,
             maxpagenum=192):
        """
        Run the whole crawl: queue the page links, then start and join
        threadnums page fetchers, threadnums link extractors and
        2 * threadnums image downloaders.
        link: URL prefix the page number is appended to
        minpagenum: first page number (inclusive)
        maxpagenum: last page number (inclusive)
        :return: None
        """
        self.getLinks(link, minpagenum, maxpagenum)

        t_list = [Thread(target=self.getPageContent)
                  for _ in range(self.threadnums)]
        t_list += [Thread(target=self.getImageLink)
                   for _ in range(self.threadnums)]
        t_list += [Thread(target=self.getImage)
                   for _ in range(self.threadnums * 2)]

        for t in t_list:
            t.start()
        for t in t_list:
            t.join()
| 182 | + |
| 183 | + |
if __name__ == '__main__':
    # Run one complete crawl and print the elapsed wall-clock seconds.
    started_at = time.time()
    crawler = MultithreadCrawler()
    crawler.main()
    elapsed = time.time() - started_at
    print(elapsed)
0 commit comments