
Found another Python crawler article on 腾云阁 (Tencent Cloud's developer community) and saved it here for reference.
Archived for later use; reposted with modifications, with Tencent Cloud's authorization.
Excerpt:
...
This script downloads all of the images from 100 pages of the meizitu site, using a Redis list as the task queue between the URL-collecting step and the downloading step.
import requests
import re
import time
from redis import Redis

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}

def push_redis_list():
    # Producer: scan 100 article pages and push every full-size image URL
    # onto the Redis list 'meizitu'.
    r = Redis(host='10.66.149.8', port=6379, password='')
    for i in range(100):
        num = 5100 + i
        url = 'http://www.meizitu.com/a/' + str(num) + '.html'
        page = requests.get(url, timeout=30)
        # Non-greedy match; a greedy .* would swallow several URLs at once.
        img_url_list = re.findall(r'http://mm\.howkuai\.com/wp-content/uploads/201.*?\.jpg', page.text)
        print(img_url_list)
        for temp_img_url in img_url_list:
            # Skip thumbnail ('limg') variants, keep only full-size images.
            if len(re.findall('limg', temp_img_url)) == 0:
                print("url: ", temp_img_url)
                r.lpush('meizitu', temp_img_url)
    print(r.llen('meizitu'))
    return 0

def get_big_img_url():
    # Consumer: pop URLs off the queue and download them one by one.
    r = Redis(host='10.66.149.8', port=6379, password='')
    while True:
        try:
            url = r.lpop('meizitu')
            if url is None:  # queue drained
                break
            url = url.decode()  # Redis returns bytes
            download(url)
            time.sleep(1)
            print(url)
        except Exception:
            print("Request failed, retrying")
            time.sleep(10)
            continue
    return 0

def download(url):
    # The ./pic directory must exist beforehand.
    try:
        r = requests.get(url, headers=headers, timeout=50)
        name = int(time.time())  # second-resolution timestamp as filename
        with open('./pic/' + str(name) + '.jpg', 'wb') as f:
            f.write(r.content)
    except Exception as e:
        print(Exception, ":", e)

if __name__ == '__main__':
    url = 'http://www.meizitu.com/a/list_1_'  # unused in this excerpt
    print("begin")
    push_redis_list()    # enable to fill the task queue
    #get_big_img_url()   # enable to run the download job

...
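One advantage of keeping the task queue in Redis rather than in process memory is that the download stage scales out trivially: any number of consumers, on any number of machines, can pop from the same 'meizitu' list. Below is a minimal sketch of that idea using the standard multiprocessing module; the worker count of 4 is an arbitrary assumption, and run_workers is a hypothetical helper, not part of the original article.

from multiprocessing import Process

def run_workers(n=4):
    # Hypothetical helper (assumed, not from the article): start n consumer
    # processes, each running the get_big_img_url() loop defined above
    # against the shared Redis list 'meizitu'.
    workers = [Process(target=get_big_img_url) for _ in range(n)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()

Assumed usage: replace the single get_big_img_url() call in __main__ with run_workers(4) to download with four parallel consumers.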
Original article: https://www.qcloud.com/community/article/337567001488804157