爬取酷狗繁星的MV
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from requests import get
import re
import json
from os import path
from time import sleep
import time
from sys import stdout
import requests
def write_json(id, obj, max_duplicates=10):
    """
    Merge newly crawled records into ``<id>.json`` without duplicates.

    Records already stored in the file are loaded first (de-duplicated, order
    preserved). Each record of ``obj`` that is not stored yet is appended.
    Scanning of ``obj`` stops as soon as ``max_duplicates`` of its records
    turn out to be already known; whatever was merged up to that point is
    still written back to the file.

    :param id: Kugou UserId; used as the base name of the JSON file
    :param obj: iterable of JSON-serialisable records to merge
    :param max_duplicates: number of already-known records after which
        scanning stops (default 10)
    :return: 1 if scanning stopped because of duplicates, otherwise 0
    """
    def _key(record):
        # Records are JSON values (typically dicts, which are unhashable), so
        # a canonical JSON string serves as the membership key.
        return json.dumps(record, sort_keys=True, ensure_ascii=False)

    file_name = f"{id}.json"
    item_list = []
    seen = set()

    if path.exists(file_name):
        with open(file_name, 'r', encoding='utf-8') as f:
            load_dict = json.load(f)
        for i in load_dict:
            k = _key(i)
            if k not in seen:
                seen.add(k)
                item_list.append(i)

    cunt = 0  # how many records of obj were already known
    retl = 0  # crawl status: 1 means the duplicate limit was reached
    for x in obj:
        k = _key(x)
        if k in seen:
            cunt += 1
            if cunt >= max_duplicates:
                print(f"重复爬取已经有{max_duplicates}条了,停止爬取。。。")
                retl = 1
                break
        else:
            seen.add(k)
            item_list.append(x)

    with open(file_name, 'w', encoding='utf-8') as f2:
        json.dump(item_list, f2, ensure_ascii=False)
    return retl
def get_json(id):
    """
    Load ``<id>.json`` from the current directory and return its content.

    Prints how many entries the file holds before returning.

    :param id: Kugou UserId; used as the base name of the JSON file
    :return: the parsed JSON content
    :raises FileNotFoundError: if ``<id>.json`` does not exist
    """
    with open(f"{id}.json", 'r', encoding='utf-8') as f:
        load_json = json.load(f)
    len_json = len(load_json)
    print(f"当前id:{id} 已经储存 {len_json} 条数据。")
    return load_json
def down_jpg(url):
    """
    Download an MV cover image into the ``pics/`` directory.

    The local file name is the fourth ``/``-separated piece of ``url``, i.e.
    the first path segment after the host.
    NOTE(review): this presumes cover URLs look like ``https://host/<file>``
    -- confirm against real ``imgUrl`` values.

    Nothing is downloaded when the target file already exists.

    :param url: URL of the image
    :return: path of the local file (``pics/<name>``)
    """
    # Function-scope import: the file only imports ``path`` from ``os``.
    from os import makedirs

    dist = "pics/" + url.split("/")[3]
    if not path.exists(dist):
        # open() below fails if the directory is missing, so create it first.
        makedirs("pics", exist_ok=True)
        jpg_request = get(url)
        with open(dist, "wb") as f:
            f.write(jpg_request.content)
        # Throttle only after a real download.
        sleep(0.3)
    return dist
def down_mp4(dist, url, name, title):
    """
    Stream an MV video to ``<dist>/<name>`` while drawing a progress bar.

    If the target file already exists, only a notice is printed and no
    request is made. Otherwise the video is streamed into ``<name>.part``
    and renamed once complete, so an interrupted download is never mistaken
    for a finished one on the next run.

    :param dist: target directory (created if missing)
    :param url: URL of the video
    :param name: file name to store the video under
    :param title: human-readable title, used only in messages
    :return: None
    """
    # Function-scope import: the file only imports ``path`` from ``os``.
    import os

    target = f'{dist}/{name}'
    if path.exists(target):
        print(f'{title} 已经存在')
        return

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    }
    # Certificate verification is disabled below (verify=False); this call
    # silences the resulting urllib3 warning.
    # NOTE(review): verify=False accepts any TLS certificate -- confirm it is
    # really needed for this host.
    requests.packages.urllib3.disable_warnings()

    os.makedirs(dist, exist_ok=True)
    partial = target + '.part'
    with get(url, headers=headers, stream=True, verify=False, allow_redirects=True) as req:
        # The header may be absent (e.g. chunked transfer); 0 means "unknown".
        file_size = int(req.headers.get('content-length', 0))
        print(f"获取视频总长度:{file_size}")
        with open(partial, 'wb') as f:
            dl = 0
            for chunk in req.iter_content(chunk_size=4096):
                if not chunk:
                    continue
                dl += len(chunk)
                f.write(chunk)
                # Bar is 50 cells wide; guard against an unknown total size.
                done = min(50, int(50 * dl / file_size)) if file_size else 0
                times = time.strftime("%H:%M:%S", time.localtime(int(time.time())))
                stdout.write("\r[%s%s]%.2f/%.2f M %s" % (
                    '=' * done, ' ' * (50 - done), dl / float(1024 * 1024), file_size / float(1024 * 1024), times))
                stdout.flush()
    # Publish the file only after the whole body has been written.
    os.replace(partial, target)
    print("\n")
def get_kugou(id, page=1):
    """
    Crawl a user's MV list page by page and store it via ``write_json``.

    Crawling stops early when a page comes back empty or when
    ``write_json`` reports that the duplicate limit was reached (the newest
    entries are already stored).

    :param id: Kugou UserId
    :param page: number of pages to crawl (20 entries are requested per page)
    :return: the ``id`` that was passed in
    :raises ValueError: if a response does not contain the expected JSONP
        payload
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
    }
    # The service answers with JSONP ("...back(<json>);"); capture the JSON.
    pattern_obj = re.compile(r".*?back\((.*?)\);")
    # Pages are 1-based; "+ 1" makes ``page`` the number of pages crawled
    # (range(1, page) skipped the last page and did nothing for page=1).
    for i in range(1, page + 1):
        url = "https://fx.service.kugou.com/NServices/Video/OfflineVideoService/getVideoList?args=[%22{}%22,{},0,20]&jsonpcallback=jsonphttpsfxservicekugoucomNServicesVideoOfflineVideoServicegetVideoListargs29640634946020jsonpcallback".format(id, i)
        response = get(url, headers=header)
        matched = pattern_obj.search(response.text)
        if matched is None:
            raise ValueError(f"unexpected response for page {i}: no JSONP payload found")
        json_text = json.loads(matched.group(1))
        json_list = json_text["data"]["list"]
        print(i, json_list)
        if len(json_list) == 0:
            print("已经爬取了:", i*20)
            print("已经结束了!")
            break
        # Persist this page's entries.
        retl = write_json(id, json_list)
        if retl == 1:
            print("已经爬取完最新的MV。。。")
            break
        # Pause between page requests.
        sleep(1)
    return id
def down_mv_jpg(obj):
    """
    Download the cover image of every MV record via ``down_jpg``.

    :param obj: iterable of MV records; each must provide the keys
        ``imgUrl`` and ``title``
    :return: None
    """
    for i in obj:
        url = i["imgUrl"]
        title = i["title"]
        disc = down_jpg(url)
        # "\r" with end="" keeps the progress on one console line.
        print(f"\r {title} 已经保存到 {disc} 目录中。", end="")
    print("\n所有图片已经保存完毕。。。")
def down_mv_mp4(obj):
    """
    Download the video of every MV record via ``down_mp4``.

    For each record the MV info endpoint is queried for the video URL and the
    file is stored as ``mp4/<hashValue>``. Records whose file already exists
    are skipped without any request. Records for which the endpoint returns
    no usable video URL are reported and skipped, so the batch continues.

    :param obj: iterable of MV records; each must provide the keys ``id``
        and ``hashValue``
    :return: None
    """
    dist = "mp4"
    for a, i in enumerate(obj, start=1):
        video_id = i["id"]
        hashValue = i["hashValue"]
        if path.exists(f"{dist}/{hashValue}"):
            print(f"\r {a} {video_id} 已经下过。。。", end="")
            continue
        response = get(f"https://fx.service.kugou.com/mvcenter/bss/mvInfo?pid=7&videoId={video_id}&deviceId=")
        mp4_json = json.loads(response.text)
        data = mp4_json.get("data") if isinstance(mp4_json, dict) else None
        if not isinstance(data, dict) or not data.get("videoUrl"):
            # A single bad record must not abort the remaining downloads.
            print(f"\r 第 {a} 个 {video_id} 没有可用的视频地址,已跳过。")
            continue
        url = data["videoUrl"]
        # Fall back to the list record when the info response lacks a field.
        name = data.get("hashValue") or hashValue
        title = data.get("title") or video_id
        print(f"\r 下载第 {a} 个 {title} 中...", end="")
        down_mp4(dist, url, name, title)
        # Short pause between downloads.
        sleep(0.2)
    print("\n所有MP4已经保存完。。。")
# Kugou UserId whose MVs are crawled; change this before use.
KUGOU_USER_ID = "296406349"


def main():
    """
    Run the interactive menu loop of the downloader.

    Commands: 1 crawl MV info, 2 load the stored MV info, 3 download cover
    images, 4 download videos; any other number ends the program.
    Commands 3 and 4 need the MV info loaded by command 2 first.
    """
    mv_json = None  # set by command 2
    while True:
        print("""
欢迎使用屹铭繁星下载器。
1.获取MV信息
2.加载MV信息
3.下载jpg
4.下载mp4
5.退出
""")
        try:
            commod = int(input("请输入您的指令:"))
        except ValueError:
            # Non-numeric input: ask again instead of crashing.
            print("请输入数字指令。")
            continue

        if commod in (3, 4) and mv_json is None:
            # Without this guard the download commands fail on missing data.
            print("请先加载MV信息(指令2)。")
            continue

        if commod == 1:
            print("MV爬中中。。。")
            get_kugou(KUGOU_USER_ID, 500)
        elif commod == 2:
            print("加载json中。。。")
            mv_json = get_json(KUGOU_USER_ID)
            print("加载json完成")
        elif commod == 3:
            print("下载jgg中...")
            down_mv_jpg(mv_json)
        elif commod == 4:
            print("下载mp4中...")
            down_mv_mp4(mv_json)
        else:
            print("欢迎下次使用,再见。")
            return


if __name__ == "__main__":
    main()
使用前请先将代码中的酷狗 UserId(示例中为 296406349)修改为您要爬取的用户 id。
朕弟分享 | 专注小众,乐于分享! - 支付
本帖含有隐藏内容,请您向作者支付 2 金币后再查看。
最后于 2022-8-25 被 admin 编辑,原因:添加购买