Scraping 真实自拍 with Python
A few days ago a guy here posted 真实自拍.com. The pictures on the site are pretty good, but just browsing wasn't satisfying enough, so I adapted an image-scraping script and pulled down 14-odd GB of pictures.
Anyone who wants them can scrape the site themselves: create a 1024 folder on the D: drive and just run the script. You can also try the code the guy in reply #26 posted. The further back the page number, the more likely the image-host links are dead.
That's all; I'm off to buy tissues.
import urllib.request, socket, re, sys, os, pathlib, time, random

baseUrl = 'https://xn--qbt00o3ns2fk.xyz/'
targetPath = 'D:\\1024\\'  # download root; create D:\1024 before running

def getContant(Weburl):
    # fetch a page and return its raw HTML as a string
    Webheader = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36',
    }
    req = urllib.request.Request(url=Weburl, headers=Webheader)
    response = urllib.request.urlopen(req)
    _contant = response.read()
    response.close()
    return str(_contant)

def gettitle(url):
    # fetch a content page and pull out its title, used below as the folder name
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers)
    file = urllib.request.urlopen(req)
    html = file.read().decode('utf-8', 'ignore')
    # the forum stripped the HTML tags out of the original pattern;
    # matching the <title> tag here is an assumption
    title = re.findall(r'<title>(.+)</title>', html)
    return str(title)

def getUrl(URL):
    # collect the content_*.html links from one index page
    contant = getContant(URL)
    comp = re.compile(r'content_\d*\.html')
    urlList1 = comp.findall(contant)
    urlList = []
    for url1 in urlList1:
        urlList.append(baseUrl + url1)
    return urlList

def openUrl(url):
    # fetch one content page and download every image it references
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'
    }
    title = gettitle(url)
    title = title[2:-2]  # strip the ['...'] wrapper left by str(list)
    filePath = targetPath + title
    if not os.path.isdir(filePath):
        os.mkdir(filePath)
    req = urllib.request.Request(url=url, headers=headers)
    res = urllib.request.urlopen(req)
    data = res.read()
    downImg(data, filePath)

def downImg(data, filePath):
    # pull every jpg/png link out of the page and download the ones we don't have yet
    html = data.decode('utf-8', 'ignore')
    for link in set(re.findall(r'(https?://[^\s"\'<>]+?\.(?:jpg|png))', html)):
        print(link)
        strpicpath = seFile(link, filePath)
        picpath = pathlib.Path(strpicpath)
        if picpath.exists():
            print('exists')
            continue
        socket.setdefaulttimeout(30)
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)
        try:
            urllib.request.urlretrieve(link, strpicpath)
            time.sleep(random.uniform(0, 0.5))  # small random delay between downloads
        except socket.timeout:
            # retry up to five times on timeout
            count = 1
            while count <= 5:
                try:
                    urllib.request.urlretrieve(link, strpicpath)
                    break
                except socket.timeout:
                    count += 1
            if count > 5:
                print('failed')
        except Exception:
            print('failed!')

def seFile(path, filePath):
    # build the local file path from the last segment of the image URL
    pos = path.rindex('/')
    return os.path.join(filePath, path[pos + 1:])

def openPage(UrlList):
    # walk every content page found on the current index page
    for pageUrl in UrlList:
        socket.setdefaulttimeout(30)
        try:
            print(pageUrl)
            openUrl(pageUrl)
        except socket.timeout:
            # retry up to five times on timeout
            count = 1
            while count <= 5:
                try:
                    openUrl(pageUrl)
                    break
                except socket.timeout:
                    count += 1
            if count > 5:
                print(pageUrl + ' failed')
        except Exception:
            print(pageUrl + ' failed')

URL = baseUrl + 'index_'
for num in range(1, 255):
    try:
        os.system('cls')  # clear the console (Windows)
        print('#######################################')
        print('##########download#####################')
        print(URL + str(num) + '.html')
        print('#######################################')
        print('#######################################')
        UrlList = getUrl(URL + str(num) + '.html')
        openPage(UrlList)
    except Exception:
        print('failed')
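
For comparison, here is a minimal sketch of the same crawl loop written against the third-party requests library instead of urllib. This is just an assumption-laden rewrite, not part of the original script: it assumes the site still serves index_N.html index pages and content_*.html content pages, that requests is installed (pip install requests), and it names folders after the page file rather than the page title.

import os
import re
import requests

BASE = 'https://xn--qbt00o3ns2fk.xyz/'   # same base URL as the script above
HEADERS = {'User-Agent': 'Mozilla/5.0'}  # minimal UA string; the full one above also works
OUT = 'D:\\1024\\'

def fetch(url):
    # one GET with a timeout; raises on HTTP errors instead of failing silently
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return r

for num in range(1, 255):
    try:
        index_html = fetch(BASE + 'index_' + str(num) + '.html').text
    except requests.RequestException:
        continue
    for page in set(re.findall(r'content_\d+\.html', index_html)):
        try:
            page_html = fetch(BASE + page).text
        except requests.RequestException:
            continue
        # one folder per content page, named after the page file itself (an assumption)
        folder = os.path.join(OUT, os.path.splitext(page)[0])
        os.makedirs(folder, exist_ok=True)
        for link in set(re.findall(r'https?://[^\s"\'<>]+?\.(?:jpg|png)', page_html)):
            dest = os.path.join(folder, link.rsplit('/', 1)[-1])
            if os.path.exists(dest):
                continue
            try:
                with open(dest, 'wb') as f:
                    f.write(fetch(link).content)
            except requests.RequestException:
                print(link + ' failed')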