
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Download every jpg/gif/png image referenced by a web page.

Run as a script: it prompts for a page URL and a target directory, extracts
the image URLs from the page's HTML and saves each image into that directory.
"""
import os
import re

# NOTE: the third-party ``requests`` package is imported inside the two
# functions that perform network I/O, so the pure helpers in this module stay
# importable (and testable) on machines where it is not installed.

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/66.0.3359.181 Safari/537.36"
    )
}

# Seconds to wait for a server before giving up; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 30

# Absolute http(s) URL ending in a known image extension.  Whitespace is not
# part of the allowed characters, so a match can never run from one URL into
# unrelated text that merely ends in ".jpg"/".gif"/".png".
IMAGE_URL_RE = re.compile(r"https?://[/.\w-]+\.(?:jpg|gif|png)")


def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Returns None (after printing the error) when the request fails, times
    out, or the server answers with an HTTP error status.
    """
    import requests

    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    return response.text


def getURLList(html):
    """Return the distinct image URLs found in *html*, in first-seen order.

    *html* may be None or empty (for example when the page fetch failed);
    an empty list is returned in that case.
    """
    if not html:
        return []
    seen = set()
    urls = []
    for match in IMAGE_URL_RE.finditer(html):
        url = match.group()
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls


def _unique_name(name, taken):
    """Return *name*, or a numbered variant of it that is not in *taken*.

    The returned name is added to *taken*, so repeated calls never hand out
    the same file name twice.
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    suffix = 1
    while candidate in taken:
        candidate = "{}_{}{}".format(stem, suffix, ext)
        suffix += 1
    taken.add(candidate)
    return candidate


def download(lst, filepath='img'):
    """Download every URL in *lst* into the directory *filepath*.

    The directory is created when missing; an empty *filepath* falls back to
    the default ``'img'``.  URLs that share a file name are saved under
    numbered names instead of overwriting each other.  A failed download is
    reported and skipped, and no file is created for it.
    """
    import requests

    filepath = filepath or 'img'
    os.makedirs(filepath, exist_ok=True)

    total = len(lst)
    taken = set()
    for position, url in enumerate(lst, start=1):
        name = _unique_name(url.split('/')[-1] or 'image', taken)
        filename = os.path.join(filepath, name)
        print("Downloading {}/{} file name:{}".format(position, total, name))
        try:
            img = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            img.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        # Open the file only after the download succeeded, so a failed
        # request never leaves an empty file behind.
        with open(filename, 'wb') as f:
            f.write(img.content)
        print("{} saved".format(filename))


def main():
    """Prompt for a page URL and a target directory, then download the images."""
    url = input('please input the image url:').strip()
    filepath = input('please input the download path:').strip()
    html = getHTMLText(url)
    if html is None:
        # The fetch error has already been printed by getHTMLText.
        return
    download(getURLList(html), filepath)


if __name__ == '__main__':
    main()
遇到个问题,图片名字一样时只会得到最后一张图片。加了个判断和随机前缀。
```python
import random

# Add the code below just before `with open(filename, 'wb') as f`.
# Only the base name gets the random prefix; the directory part is kept so
# the renamed file is still saved into the download folder.
directory, basename = os.path.split(filename)
while os.path.isfile(filename):
    # Retry until the prefixed name is free as well.
    filename = os.path.join(directory, str(random.randint(1, 10000)) + basename)
```
1 soho176 2018 年 5 月 21 日 urllib.urlretrieve 这个下图片不错 你试试 |
3 cy97cool 2018 年 5 月 21 日 via Android 没考虑中文文件名图片吧 需要 urldecode 一下 另外要不要处理文件名中的特殊符号 可能不能作为文件名的 url? |
4 liyiecho 2018 年 5 月 21 日 下载之类的,我觉得还是调用 aria2 来下载比较好,aria2 可以保证下载内容的完整性。如果用 python 模块下载的话,当遇到网络问题或者报错的时候,下载的内容可能不是完整的了。 |
5 ucun OP @soho176 #1 urlretrieve 下载图片坑多。图片模糊、打不开等等
```python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Download every jpg/gif/png image referenced by a web page (urllib only)."""
import os
import re
from urllib.error import HTTPError
from urllib.request import Request, build_opener, install_opener, urlopen, urlretrieve

# Browser-like User-Agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/66.0.3359.181 Safari/537.36"
)

# Seconds to wait for the page request before giving up.
REQUEST_TIMEOUT = 30

# Absolute http(s) URL ending in a known image extension; whitespace is not
# allowed inside the URL, so a match cannot run into unrelated text.
IMAGE_URL_RE = re.compile(r"https?://[/.\w-]+\.(?:jpg|gif|png)")


def getHTMLText(url):
    """Fetch *url* and return the page as text, or None when the fetch fails."""
    # Use the names imported above; the bare name ``urllib`` is never bound
    # by ``from urllib.request import ...``.
    req = Request(url=url, headers={"User-Agent": USER_AGENT})
    try:
        with urlopen(req, timeout=REQUEST_TIMEOUT) as f:
            # Undecodable bytes are replaced rather than raised on.
            return f.read().decode('utf-8', errors='replace')
    except HTTPError as e:
        print('Error code:', e.code)
    except OSError as e:
        # URLError (DNS failure, refused connection) and socket timeouts.
        print(e)
    return None


def getURLList(html):
    """Return the distinct image URLs found in *html*, in first-seen order."""
    if not html:
        return []
    seen = set()
    urls = []
    for match in IMAGE_URL_RE.finditer(html):
        url = match.group()
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls


def _unique_name(name, taken):
    """Return *name*, or a numbered variant of it that is not in *taken*."""
    stem, ext = os.path.splitext(name)
    candidate = name
    suffix = 1
    while candidate in taken:
        candidate = "{}_{}{}".format(stem, suffix, ext)
        suffix += 1
    taken.add(candidate)
    return candidate


def download(lst, filepath='img'):
    """Save every URL in *lst* into the directory *filepath*.

    The directory is created when missing; an empty *filepath* falls back to
    ``'img'``.  A failed download is printed and skipped.
    """
    filepath = filepath or 'img'
    os.makedirs(filepath, exist_ok=True)

    # Install the User-Agent opener once; urlretrieve uses the global opener.
    opener = build_opener()
    opener.addheaders = [("User-Agent", USER_AGENT)]
    install_opener(opener)

    taken = set()
    for url in lst:
        name = _unique_name(url.split('/')[-1] or 'image', taken)
        filename = os.path.join(filepath, name)
        try:
            urlretrieve(url, filename)
        except OSError as e:
            # Covers HTTPError, URLError and ContentTooShortError.
            print(e)


if __name__ == '__main__':
    url = input('please input the image url:')
    filepath = input('please input the download path:')
    html = getHTMLText(url)
    lst = getURLList(html)
    download(lst, filepath)
```
|