请注意,本文编写于 90 天前,最后修改于 31 天前,其中某些信息可能已经过时。
太久没写过 Python,再加上之前写的 p 站爬虫已经失效,于是借这个机会把爬虫复习一遍。
首先这里的p站不是pixiv,而是https://pixivic.com/,选择它的原因有几点
按 F12 打开浏览器的开发者工具,在网络(Network)面板里看到的这个请求地址,就是我们要找的画师插画接口。
让我们来分析一下这个接口。
https://pix.ipv4.host/artists/画师ID/illusts/illust?page=页码&pageSize=30&maxSanityLevel=4
接下来尝试用python发个请求
返回了我们需要的值
(但有时候可能会提示登录,此时只需在请求头里加上 authorization 字段即可)
往下翻就可以看到我们需要的原始链接了
所以原始链接所在的层级结构为["data"][0]["imageUrls"][0]["original"]
这里我定义了一个图片类,方便储存插画信息
# Module-level defaults used by the crawler.
error = 0       # failure counter; the functions below increment it via `global error`
Artist_id = ''  # default artist id (empty string)
page = 1        # default page number
class Pictures:
    """Plain record holding the metadata of one illustration.

    Attributes:
        title: illustration title.
        id: illustration id.
        width: illustration width.
        height: illustration height.
        url: link to the illustration image.
    """

    def __init__(self, title, id, width, height, url):
        # Store every constructor argument unchanged on the instance.
        self.title, self.id = title, id
        self.width, self.height = width, height
        self.url = url
获取画师的信息并输出
def GetArtist(Artist_id, headers):
    """Fetch an artist's profile, print a short summary and return the JSON.

    Args:
        Artist_id: artist id as a string; it is appended to the API URL.
        headers: HTTP headers passed through to ``requests.get``.

    Returns:
        The decoded JSON document (its ``"data"`` entry holds the profile)
        when the server answers with HTTP 200, otherwise ``None``.

    Side effects:
        Prints the artist's name, gender, birthday and region. A failed
        request is printed and counted in the module-level ``error``.
    """
    global error
    mesUrl = "https://pix.ipv4.host/artists/" + Artist_id
    try:
        rep = requests.get(mesUrl, headers=headers, timeout=6)
    except Exception as err:  # timeout, connection failure, ...
        print(err)
        error += 1
        # Without a response there is nothing to inspect; falling through
        # would hit an unbound `rep`.
        return None

    if rep.status_code != 200:
        return None

    rep.encoding = 'utf-8'  # avoid mojibake when decoding the body
    json_str = json.loads(rep.text)
    data = json_str["data"]
    # str() keeps the summary printable when a field is null in the JSON.
    print("画师名称:" + str(data["name"]) + "\n"
          "性别\t:" + str(data["gender"]) + "\n"
          "出生日期:" + str(data["birthDay"]) + "\n"
          "住址\t:" + str(data["region"]) + "\n"
          )
    return json_str
获取原始图片
def GetOrigin(Artist_id, headers, page):
    """Fetch one page of an artist's illustrations.

    Args:
        Artist_id: artist id as a string; it is inserted into the API URL.
        headers: HTTP headers passed through to ``requests.get``.
        page: page number to request (30 entries are requested per page).

    Returns:
        A list of ``Pictures`` for the page; ``None`` when the response is
        the bare success message without data or the page holds no entries
        (both mean there is nothing more to fetch). If an exception occurs,
        the entries collected so far (possibly an empty list) are returned
        and the module-level ``error`` counter is incremented.
    """
    global error
    imgs = []
    api_url = ("https://pix.ipv4.host/artists/" + Artist_id +
               "/illusts/illust?page=" + str(page) +
               "&pageSize=30&maxSanityLevel=4")
    try:
        # Same timeout as GetArtist so a stalled connection cannot hang the crawl.
        rep = requests.get(api_url, headers=headers, timeout=6)
        rep.encoding = 'utf-8'  # avoid mojibake when decoding the body
        if rep.text == '{"message":"获取画师画作列表成功"}':
            # Success message with no "data": treated as the end marker.
            return None
        entries = json.loads(rep.text)["data"]
        if not entries:
            # An empty page carries no further work either.
            return None
        for entry in entries:
            # Rewrite the image host from i.pximg.net to the pixivic one.
            imgurl = entry["imageUrls"][0]["original"].replace(
                "i.pximg.net", "original.img.pixivic.net")
            imgs.append(Pictures(entry["title"], entry["id"],
                                 entry["width"], entry["height"], imgurl))
        return imgs
    except Exception as err:
        print(err)
        error += 1
        return imgs
def GetAllImgs(Artist_id, headers):
    """Collect the illustrations of an artist across all result pages.

    Pages are requested one after another, starting at page 1, through
    ``GetOrigin``.

    Args:
        Artist_id: artist id as a string.
        headers: HTTP headers forwarded to ``GetOrigin``.

    Returns:
        A single list with the ``Pictures`` of every fetched page.
    """
    page = 1
    imgs = []
    while True:
        batch = GetOrigin(Artist_id, headers, page)
        # Stop on None (no more pages) and also on an empty list: GetOrigin
        # returns [] when the request itself fails, and continuing would
        # loop forever while the network is down.
        if not batch:
            break
        imgs += batch
        page += 1
    return imgs
设置下载地址
def Set_path(path):
    """Normalise a directory path and make sure the directory exists.

    Args:
        path: directory path, with or without a trailing separator.

    Returns:
        ``path`` ending in a separator. A trailing ``/`` or ``\\`` is kept
        as is; otherwise ``/`` is appended. The directory (including missing
        parents) is created when it does not exist yet.
    """
    # The original test `!= '/' or != '\\'` was always true, so a second
    # separator was appended to paths that already ended with one.
    if not path.endswith(('/', '\\')):
        path = path + '/'
    if not os.path.exists(path):
        os.makedirs(path)
    return path
防止下载的插画名称重复
def Rename(names):
    """Make duplicate illustration names distinct by appending ``(n)``.

    For each position in turn, every other entry equal to it receives a
    numeric suffix starting at ``(2)``. The list is modified in place and
    the same list object is returned.

    Args:
        names: list of name strings.

    Returns:
        ``names`` itself, after renaming the duplicates.
    """
    for i in range(len(names)):
        suffix = 2  # numbering restarts for every reference entry
        for z, candidate in enumerate(names):
            if z != i and names[i] == candidate:
                names[z] = candidate + '(' + str(suffix) + ')'
                suffix += 1
    return names
最后,下载图片
def ReadImgs(artistId, path):
    """Download every illustration of an artist into ``path/<artist name>/``.

    Uses the module-level ``headers`` (defined outside this excerpt) for
    the API requests. One thread is started per illustration, and the
    function returns once all of them have finished.

    Args:
        artistId: artist id as a string.
        path: base download directory; created when missing.
    """
    imgs = GetAllImgs(artistId, headers)
    ArtistJSON = GetArtist(artistId, headers)
    # Fall back to the id as folder name when the profile is unavailable.
    if ArtistJSON:
        artistName = ArtistJSON["data"]["name"]
    else:
        artistName = str(artistId)
    path = Set_path(Set_path(path) + artistName)

    # Characters replaced in titles before they are used as file names.
    # Raw string: the old pattern relied on invalid escapes such as "\=".
    # The replacement is "_" because "?" is itself illegal on Windows.
    unsafe = re.compile(r'[/\\=.,\'"\[\]:|*?<>]')

    workers = []
    for img in imgs:
        lastion = img.url[-3:]  # file extension: last three characters of the URL
        name = unsafe.sub('_', img.title)  # file name derived from the title
        paths = path + name + '.' + lastion  # preferred target path
        paths_err = path + str(img.id) + '.' + lastion  # fallback path named by id
        print('New thread:' + name)
        # Pass the callable and its arguments separately. The original
        # `target=Download(...)` ran the download inline and handed the
        # thread None as its target.
        t = threading.Thread(target=Download, args=(img, paths, paths_err))
        t.start()
        workers.append(t)

    # NOTE(review): one thread per image is unbounded; consider a pool if
    # artists with very many illustrations are crawled.
    for t in workers:
        # As before, return only after every download has completed.
        t.join()
这是下载的线程函数
def Download(img, path, errorPath):
    """Download one illustration and store it on disk (thread worker).

    Args:
        img: object with ``id``, ``url`` and ``title`` attributes.
        path: preferred target file path.
        errorPath: fallback target file path, tried when writing to
            ``path`` fails (for example because of an invalid file name).

    Side effects:
        Writes the image bytes to ``path`` or ``errorPath`` and prints the
        outcome. Every failure (request error, non-200 status, both writes
        failing) increments the module-level ``error`` counter.
    """
    global error
    headers = {
        "referer": "https://pixivic.com/illusts/" + str(img.id) + "?VNK=8ff14730",
        "user-agent": UserAgent().random
    }

    try:
        # The timeout keeps a stalled connection from blocking the worker forever.
        rep = requests.get(img.url, headers=headers, timeout=30)
    except Exception as err:
        error += 1
        print(err)
        return

    if rep.status_code != 200:
        print("DownLoad Failed" + str(rep.status_code))
        error += 1
        return

    # The original fallback was guarded by `err == NameError`, which is
    # never true, so errorPath was never used. Try the preferred path
    # first and the fallback second.
    last_err = None
    for target in (path, errorPath):
        try:
            with open(target, "wb") as f:
                f.write(rep.content)
        except OSError as err:
            last_err = err
        else:
            print("DownLoadSuccessfully---" + img.title)
            return

    error += 1
    print(last_err)
到此p站爬虫算是完成了,运行一下看看效果
上述的也仅仅是个人的观点,发发博客的同时也见证一下自己学习的历程,如果能对你有帮助也是再好不过了,最后,下面是成品和源代码
全部评论 (暂无评论)
info 还没有任何评论,你来说两句呐!