百度网盘爬虫(如何爬取百度网盘) - V2EX
V2EX = way to explore
V2EX 是一个关于分享和探索的地方
现在注册
已注册用户请  登录
推荐学习书目
Learn Python the Hard Way
Python Sites
PyPI - Python Package Index
http://diveintopython.org/toc/index.html
Pocoo
值得关注的项目
PyPy
Celery
Jinja2
Read the Docs
gevent
pyenv
virtualenv
Stackless Python
Beautiful Soup
结巴中文分词
Green Unicorn
Sentry
Shovel
Pyflakes
pytest
Python 编程
pep8 Checker
Styles
PEP 8
Google Python Style Guide
Code Style from The Hitchhiker's Guide
3380626465
V2EX    Python

百度网盘爬虫(如何爬取百度网盘)

  •  1
     
  •   3380626465 2017-03-20 09:36:36 +08:00 10347 次点击
    这是一个创建于 3185 天前的主题,其中的信息可能已经有所发展或是发生改变。

    因为要做去转盘网(分类模式点我),所以一定要爬取网盘资源,本来想自己写一个爬虫挺不容易的,不想分享出来,但最后还是决定了拿给大家一起看吧,毕竟有交流才有进步,有兴趣的朋友也可以看看我写的其他日志或者关注我,会发现去转盘网的大部分技术现在可以说是公开状态,如有对你有帮助还是认真读读吧,下面是爬虫代码,我立马公开:

    ps :不会 python 的孩子先去学学 python ,代码是 python 写的

    我附上点资料:点我下载 1 点我下载 2

    其实还有个磁力站,不过暂时技术不想公开出来,之后也想公开,喜欢的看看: ok 搜搜

    #coding: utf8 """ author:haoning create time: 2015-8-15 """ import re #正则表达式模块 import urllib2 #获取 URLs 的组件 import time from Queue import Queue import threading, errno, datetime import json import requests #Requests is an Apache2 Licensed HTTP library import MySQLdb as mdb DB_HOST = '127.0.0.1' DB_USER = 'root' DB_PASS = '' #以下是正则匹配规则 re_start = re.compile(r'start=(\d+)') #\d 表示 0-9 任意一个数字 后面有+号 说明这个 0-9 单个数位出现一到多次 比如 21312314 re_uid = re.compile(r'query_uk=(\d+)') #查询编号 re_urlid = re.compile(r'&urlid=(\d+)') #url 编号 OnEPAGE= 20 #一页数据量 OnESHAREPAGE= 20 #一页分享连接量 #缺少专辑列表 URL_SHARE = 'http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&start={start}&limit=20&query_uk={uk}&urlid={id}' #获得分享列表 """ {"feed_type":"share","category":6,"public":"1","shareid":"1541924625","data_id":"2418757107690953697","title":"\u5723\u8bde\u58c1\u7eb8\u5927\u6d3e\u9001","third":0,"clienttype":0,"filecount":1,"uk":1798788396,"username":"SONYcity03","feed_time":1418986714000,"desc":"","avatar_url":"http:\/\/himg.bdimg.com\/sys\/portrait\/item\/1b6bf333.jpg","dir_cnt":1,"filelist":[{"server_filename":"\u5723\u8bde\u58c1\u7eb8\u5927\u6d3e\u9001","category":6,"isdir":1,"size":1024,"fs_id":870907642649299,"path":"%2F%E5%9C%A3%E8%AF%9E%E5%A3%81%E7%BA%B8%E5%A4%A7%E6%B4%BE%E9%80%81","md5":"0","sign":"1221d7d56438970225926ad552423ff6a5d3dd33","time_stamp":1439542024}],"source_uid":"871590683","source_id":"1541924625","shorturl":"1dDndV6T","vCnt":34296,"dCnt":7527,"tCnt":5056,"like_status":0,"like_count":60,"comment_count":19}, public:公开分享 title:文件名称 uk:用户编号 """ URL_FOLLOW = 'http://yun.baidu.com/pcloud/friend/getfollowlist?query_uk={uk}&limit=20&start={start}&urlid={id}' #获得订阅列表 """ 
{"type":-1,"follow_uname":"\u597d\u55e8\u597d\u55e8\u554a","avatar_url":"http:\/\/himg.bdimg.com\/sys\/portrait\/item\/979b832f.jpg","intro":"\u9700\u8981\u597d\u8d44\u6599\u52a0994798392","user_type":0,"is_vip":0,"follow_count":2,"fans_count":2276,"follow_time":1415614418,"pubshare_count":36,"follow_uk":2603342172,"album_count":0}, follow_uname:订阅名称 fans_count :粉丝数 """ URL_FANS = 'http://yun.baidu.com/pcloud/friend/getfanslist?query_uk={uk}&limit=20&start={start}&urlid={id}' # 获取关注列表 """ {"type":-1,"fans_uname":"\u62e8\u52a8\u795e\u7684\u5fc3\u7eea","avatar_url":"http:\/\/himg.bdimg.com\/sys\/portrait\/item\/d5119a2b.jpg","intro":"","user_type":0,"is_vip":0,"follow_count":8,"fans_count":39,"follow_time":1439541512,"pubshare_count":15,"fans_uk":288332613,"album_count":0} avatar_url :头像 fans_uname :用户名 """ QNUM = 1000 hc_q = Queue(20) #请求队列 hc_r = Queue(QNUM) #接收队列 success = 0 failed = 0 def req_worker(inx): #请求 s = requests.Session() #请求对象 while True: req_item = hc_q.get() #获得请求项 req_type = req_item[0] #请求类型,分享?订阅?粉丝? 
url = req_item[1] #url r = s.get(url) #通过 url 获得数据 hc_r.put((r.text, url)) #将获得数据文本和 url 放入接收队列 print "req_worker#", inx, url #inx 线程编号; url 分析了的 url def response_worker(): #处理工作 dbcOnn= mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8') dbcurr = dbconn.cursor() dbcurr.execute('SET NAMES utf8') dbcurr.execute('set global wait_timeout=60000') #以上皆是数据库操作 while True: """ #正则备注 match() 决定 RE 是否在字符串刚开始的位置匹配 search() 扫描字符串,找到这个 RE 匹配的位置 findall() 找到 RE 匹配的所有子串,并把它们作为一个列表返回 finditer() 找到 RE 匹配的所有子串,并把它们作为一个迭代器返回 百度页面链接: http://pan.baidu.com/share/link?shareid=3685432306&uk=1798788396&from=hotrec uk 其实用户 id 值 """ metadata, effective_url = hc_r.get() #获得 metadata (也就是前面的 r.text )和有效的 url #print "response_worker:", effective_url try: tnow = int(time.time()) #获得当前时间 id re_urlid.findall(effective_url)[0] #获得 re_urlid 用户编号 start = re_start.findall(effective_url)[0] #获得 start 用户编号 if True: if 'getfollowlist' in effective_url: #type = 1 ,也就是订阅类 follows = json.loads(metadata) #以将文本数据转化成 json 数据格式返回 uid = re_uid.findall(effective_url)[0] #获得 re_uid ,查询编号 if "total_count" in follows.keys() and follows["total_count"]>0 and str(start) == "0": #获得订阅数量 for i in range((follows["total_count"]-1)/ONEPAGE): #开始一页一页获取有用信息 try: dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 1, 0)' % (uid, str(ONEPAGE*(i+1)), str(ONEPAGE))) #存储 url 编号,订阅中有用户编号, start 表示从多少条数据开始获取,初始 status=0 为未分析状态 except Exception as ex: print "E1", str(ex) pass if "follow_list" in follows.keys(): #如果订阅者也订阅了,即拥有 follow_list for item in follows["follow_list"]: try: dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess) VALUES(%s, "%s", 0, 0, 0, %s)' % (item['follow_uk'], item['follow_uname'], str(tnow))) #存储订阅这的用户编号,用户名,入库时间 except Exception as ex: print "E13", str(ex) pass else: print "delete 1", uid, start dbcurr.execute('delete from urlids where uk=%s and type=1 and start>%s' % (uid, start)) elif 'getfanslist' in effective_url: #type 
= 2,也就是粉丝列表 fans = json.loads(metadata) uid = re_uid.findall(effective_url)[0] if "total_count" in fans.keys() and fans["total_count"]>0 and str(start) == "0": for i in range((fans["total_count"]-1)/ONEPAGE): try: dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 2, 0)' % (uid, str(ONEPAGE*(i+1)), str(ONEPAGE))) except Exception as ex: print "E2", str(ex) pass if "fans_list" in fans.keys(): for item in fans["fans_list"]: try: dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess) VALUES(%s, "%s", 0, 0, 0, %s)' % (item['fans_uk'], item['fans_uname'], str(tnow))) except Exception as ex: print "E23", str(ex) pass else: print "delete 2", uid, start dbcurr.execute('delete from urlids where uk=%s and type=2 and start>%s' % (uid, start)) else: #type=0 ,也即是分享列表 shares = json.loads(metadata) uid = re_uid.findall(effective_url)[0] if "total_count" in shares.keys() and shares["total_count"]>0 and str(start) == "0": for i in range((shares["total_count"]-1)/ONESHAREPAGE): try: dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 0, 0)' % (uid, str(ONESHAREPAGE*(i+1)), str(ONESHAREPAGE))) except Exception as ex: print "E3", str(ex) pass if "records" in shares.keys(): for item in shares["records"]: try: dbcurr.execute('INSERT INTO share(userid, filename, shareid, status) VALUES(%s, "%s", %s, 0)' % (uid, item['title'], item['shareid'])) #item['title']恰好是文件名称 #返回的 json 信息: except Exception as ex: #print "E33", str(ex), item pass else: print "delete 0", uid, start dbcurr.execute('delete from urlids where uk=%s and type=0 and start>%s' % (uid, str(start))) dbcurr.execute('delete from urlids where id=%s' % (id, )) dbconn.commit() except Exception as ex: print "E5", str(ex), id dbcurr.close() dbconn.close() #关闭数据库 def worker(): global success, failed dbcOnn= mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8') dbcurr = dbconn.cursor() dbcurr.execute('SET NAMES 
utf8') dbcurr.execute('set global wait_timeout=60000') #以上是数据库相关设置 while True: #dbcurr.execute('select * from urlids where status=0 order by type limit 1') dbcurr.execute('select * from urlids where status=0 and type>0 limit 1') #type>0,为非分享列表 d = dbcurr.fetchall() #每次取出一条数据出来 #print d if d: #如果数据存在 id = d[0][0] #请求 url 编号 uk = d[0][1] #用户编号 start = d[0][2] limit = d[0][3] type = d[0][4] #哪种类型 dbcurr.execute('update urlids set status=1 where id=%s' % (str(id),)) #状态更新为 1 ,已经访问过了 url = "" if type == 0: #分享 url = URL_SHARE.format(uk=uk, start=start, id=id).encode('utf-8') #分享列表格式化 #query_uk uk 查询编号 #start #urlid id url 编号 elif type == 1: #订阅 url = URL_FOLLOW.format(uk=uk, start=start, id=id).encode('utf-8') #订阅列表格式化 elif type == 2: #粉丝 url = URL_FANS.format(uk=uk, start=start, id=id).encode('utf-8') #关注列表格式化 if url: hc_q.put((type, url)) #如果 url 存在,则放入请求队列, type 表示从哪里获得数据 #通过以上的 url 就可以获得相应情况下的数据的 json 数据格式,如分享信息的,订阅信息的,粉丝信息的 #print "processed", url else: #否则从订阅者或者粉丝的引出人中获得信息来存储,这个过程是爬虫树的下一层扩展 dbcurr.execute('select * from user where status=0 limit 1000') d = dbcurr.fetchall() if d: for item in d: try: dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 0, 0)' % (item[1], str(ONESHAREPAGE))) #uk 查询号,其实是用户编号 #start 从第 1 条数据出发获取信息 # dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 1, 0)' % (item[1], str(ONEPAGE))) dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 2, 0)' % (item[1], str(ONEPAGE))) dbcurr.execute('update user set status=1 where userid=%s' % (item[1],)) #做个标志,该条数据已经访问过了 #跟新了分享,订阅,粉丝三部分数据 except Exception as ex: print "E6", str(ex) else: time.sleep(1) dbconn.commit() dbcurr.close() dbconn.close() def main(): print 'starting at:',now() for item in range(16): t = threading.Thread(target = req_worker, args = (item,)) t.setDaemon(True) t.start() #请求线程开启,共开启 16 个线程 s = threading.Thread(target = worker, args = ()) s.setDaemon(True) s.start() 
#worker 线程开启 response_worker() #response_worker 开始工作 print 'all Done at:', now() 

    本人建个qq群,欢迎大家一起交流技术, 群号:512245829 喜欢微博的朋友关注:转盘娱乐即可

    21 条回复    2017-03-30 10:21:05 +08:00
    icherler
        1
    icherler  
       2017-03-20 10:34:24 +08:00
    之前用过去转盘,感觉不错主要是可以把失效的链接标记出来。已关注,还有就是谢谢分享。
    araraloren
        2
    araraloren  
       2017-03-20 10:35:11 +08:00
    mark 一下,有空看看
    xvx
        3
    xvx  
       2017-03-20 10:40:39 +08:00 via iPhone
    还在用 PY2 ?
    ipoh
        4
    ipoh  
       2017-03-20 10:43:58 +08:00   1
    github 上一大堆你还不舍得分享。。
    vipwpcom
        5
    vipwpcom  
       2017-03-20 10:46:47 +08:00
    @ipoh 不要瞎说大实话
    sangmong
        6
    sangmong  
       2017-03-20 11:23:54 +08:00
    mark
    master13
        7
    master13  
       2017-03-20 11:48:32 +08:00   1
    @xvx [捂脸]这也喷,别人写个代码,先喷功能不好,再喷代码样式差,实在不行还能喷语言比较老、风格比我 low ……

世界就不能多一点爱吗……
    fhefh
        8
    fhefh  
       2017-03-20 11:57:07 +08:00
    zqcolor
        9
    zqcolor  
       2017-03-20 12:14:14 +08:00
    谢谢分享
    maemolee
        10
    maemolee  
       2017-03-20 13:26:23 +08:00
    @master13 文人相轻嘛。
    xvx
        12
    xvx  
       2017-03-20 15:38:03 +08:00 via iPhone
    @master13 你的内心还真黑暗。
    原话:还在用 PY2 ?
    我就问一下,你竟然能解读出喷的意思……
    后面竟然还有人同样理解的,我还能说啥。
    xkx
        13
    xkx  
       2017-03-20 23:21:39 +08:00
    他们不是一两个人,我也解读出喷的意思了
    we3613040
        14
    we3613040  
       2017-03-21 00:48:33 +08:00
    mark
    hitmanx
        15
    hitmanx  
       2017-03-21 12:56:17 +08:00
    @xvx 我读出来的语气也是对于用 py2 有点鄙夷~
    styletjy
        16
    styletjy  
       2017-03-21 13:15:37 +08:00
    @xvx 你要是加个手动滑稽可能效果就不一样了
    xvx
        17
    xvx  
       2017-03-21 14:05:25 +08:00
    @hitmanx
    @styletjy 我只能说我们有代沟。
    J0022ZjV7055oN64
        18
    J0022ZjV7055oN64  
       2017-03-21 15:40:57 +08:00
    我们也有代沟
    master13
        19
    master13  
       2017-03-29 10:49:25 +08:00
    @xvx 哥哥,我一个人能读出来,说明我心里黑暗,我心里变态。 N 多人都能读出来,究竟说明谁心里变态?究竟打了谁的脸?
    xvx
        20
    xvx  
       2017-03-29 12:54:35 +08:00 via iPhone
    @master13 有很多东西不是以人数来决定的。
    PS :这么小的事,过了这么久你还拿出来喷……
    master13
        21
    master13  
       2017-03-30 10:21:05 +08:00
    @xvx 我也不是经常上 V2 ,昨天才看到……
    关于     帮助文档     自助推广系统     博客     API     FAQ     Solana     3183 人在线   最高记录 6679       Select Language
    创意工作者们的社区
    World is powered by solitude
    VERSION: 3.9.8.5 26ms UTC 11:42 PVG 19:42 LAX 03:42 JFK 06:42
    Do have faith in what you're doing.
    ubao msn snddm index pchome yahoo rakuten mypaper meadowduck bidyahoo youbao zxmzxm asda bnvcg cvbfg dfscv mmhjk xxddc yybgb zznbn ccubao uaitu acv GXCV ET GDG YH FG BCVB FJFH CBRE CBC GDG ET54 WRWR RWER WREW WRWER RWER SDG EW SF DSFSF fbbs ubao fhd dfg ewr dg df ewwr ewwr et ruyut utut dfg fgd gdfgt etg dfgt dfgd ert4 gd fgg wr 235 wer3 we vsdf sdf gdf ert xcv sdf rwer hfd dfg cvb rwf afb dfh jgh bmn lgh rty gfds cxv xcv xcs vdas fdf fgd cv sdf tert sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf shasha9178 shasha9178 shasha9178 shasha9178 shasha9178 liflif2 liflif2 liflif2 liflif2 liflif2 liblib3 liblib3 liblib3 liblib3 liblib3 zhazha444 zhazha444 zhazha444 zhazha444 zhazha444 dende5 dende denden denden2 denden21 fenfen9 fenf619 fen619 fenfe9 fe619 sdf sdf sdf sdf sdf zhazh90 zhazh0 zhaa50 zha90 zh590 zho zhoz zhozh zhozho zhozho2 lislis lls95 lili95 lils5 liss9 sdf0ty987 sdft876 sdft9876 sdf09876 sd0t9876 sdf0ty98 sdf0976 sdf0ty986 sdf0ty96 sdf0t76 sdf0876 df0ty98 sf0t876 sd0ty76 sdy76 sdf76 sdf0t76 sdf0ty9 sdf0ty98 sdf0ty987 sdf0ty98 sdf6676 sdf876 sd876 sd876 sdf6 sdf6 sdf9876 sdf0t sdf06 sdf0ty9776 sdf0ty9776 sdf0ty76 sdf8876 sdf0t sd6 sdf06 s688876 sd688 sdf86