# lyric_by_music.py
# Forked from NacedWang/163MusicSpider.
"""
根据歌曲 ID 获得所有的歌曲所对应的热门评论和歌词
"""
import datetime
import json
import math
import random
import re
import time
from concurrent.futures.process import ProcessPoolExecutor
import requests
from src import sql, redis_util
from src.util.user_agents import agents
class LyricComment(object):
    """Fetch one song's lyric from the NetEase Cloud Music API and persist it.

    Redis (through ``redis_util``) serves as a de-duplication cache of the
    song IDs that have already been requested; lyrics are written through
    ``sql.insert_lyric``.
    """

    # Baseline browser-like request headers. Treated as read-only: every
    # request works on a copy, so the randomised User-Agent never leaks into
    # this dict, which is shared by all instances.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Host': 'music.163.com',
        'Pragma': 'no-cache',
        'Referer': 'http://music.163.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # Seconds to wait for the lyric API. Without a timeout, requests waits
    # forever on a stalled connection and the whole crawl hangs.
    request_timeout = 10

    # One bracketed LRC tag such as "[00:12.34]" or "[ar:Artist]". The
    # character class stops at the first closing bracket, so text between two
    # tags on the same line is left untouched.
    _LRC_TAG = re.compile(r'\[[^\[\]\n]*\]')

    @staticmethod
    def _strip_time_tags(lyric):
        """Return ``lyric`` with every bracketed LRC tag removed and trimmed."""
        return LyricComment._LRC_TAG.sub('', lyric).strip()

    def saveLyric(self, music_id):
        """Download the lyric of ``music_id`` and store it in the database.

        Songs that Redis already records as requested are skipped. A response
        whose ``code`` is not 200 is logged and skipped without being recorded
        in Redis. Database insert errors are logged and swallowed.

        Args:
            music_id: ID of the song; it is converted with ``str()`` for the
                URL and the Redis key.

        Returns:
            None.

        Raises:
            Exception: errors from the HTTP request (including a timeout) or
                from JSON decoding are not handled here and propagate to the
                caller.
        """
        song_key = str(music_id)
        url = 'http://music.163.com/api/song/lyric?id=' + song_key + '&lv=1&kv=1&tv=1'

        # Ask Redis whether this song was already crawled.
        if redis_util.checkIfRequest(redis_util.lyricPrefix, song_key):
            print("url:", url, "has request. pass")
            return

        # Rotate the User-Agent on a private copy of the shared headers.
        request_headers = dict(self.headers)
        request_headers["User-Agent"] = random.choice(agents)
        r = requests.get(url, headers=request_headers, timeout=self.request_timeout)
        lyricJson = json.loads(r.text)

        if lyricJson.get('code') != 200:
            print(url, " request error :", lyricJson)
            return

        # Record the song in the Redis de-duplication cache.
        # NOTE(review): this happens before the insert below, so a failed
        # insert is presumably not retried on later runs - confirm intended.
        redis_util.saveUrl(redis_util.lyricPrefix, song_key)

        # 'lrc' may be absent, and its 'lyric' field may be missing or null;
        # all of these are treated as "no lyric".
        lrc = lyricJson.get('lrc')
        raw_lyric = lrc.get('lyric') if isinstance(lrc, dict) else None
        if raw_lyric is None:
            print(str(music_id), "has no lyric", lyricJson)
        else:
            finalLyric = self._strip_time_tags(raw_lyric)
            try:
                sql.insert_lyric(music_id, finalLyric)
            except Exception as e:
                print(music_id, "insert error", str(e))

        # Pause for a second after a completed request to stay under the
        # site's anti-crawler limits.
        time.sleep(1)
def saveLyricBatch(index, page_size=1000):
    """Crawl the lyrics for one page of songs.

    Args:
        index: Zero-based page number; the page starts at row
            ``page_size * index``.
        page_size: Number of songs fetched per page. Defaults to 1000.

    Returns:
        None. A failure on one song is logged and the loop moves on to the
        next song.
    """
    my_lyric_comment = LyricComment()
    offset = page_size * index
    musics = sql.get_music_page(offset, page_size)
    print("index:", index, "offset:", offset, "musics :", len(musics), "start")
    for item in musics:
        # Resolved inside the try so that a malformed row is logged like any
        # other failure instead of raising again from the except handler.
        music_id = None
        try:
            music_id = item['music_id']
            my_lyric_comment.saveLyric(music_id)
        except Exception as e:
            # Log the error, then back off briefly before the next song.
            print(music_id, ' internal error : ' + str(e))
            time.sleep(1)
    print("index:", index, "finished")
def lyricSpider():
    """Crawl the lyrics of every song in the database, one page at a time.

    Prints a start banner and timestamp, runs ``saveLyricBatch`` for each
    page of 1000 songs in order, then prints an end banner, timestamp and the
    elapsed time in whole seconds.

    Returns:
        None.
    """
    print("======= 开始爬 歌词 信息 ===========")
    startTime = datetime.datetime.now()
    print(startTime.strftime('%Y-%m-%d %H:%M:%S'))

    # Total number of songs; a missing or null count means nothing to crawl.
    musics_num = sql.get_all_music_num()
    total = musics_num.get('num') or 0
    # Number of pages, 1000 songs per page (the page size saveLyricBatch uses).
    batch = math.ceil(total / 1000.0)

    # Pages are processed sequentially in this process.
    for index in range(batch):
        saveLyricBatch(index)

    print("======= 结束爬 歌词 信息 ===========")
    endTime = datetime.datetime.now()
    print(endTime.strftime('%Y-%m-%d %H:%M:%S'))
    # total_seconds() covers the whole duration; timedelta.seconds would drop
    # full days, and a crawl of this size can run longer than 24 hours.
    print("耗时:", int((endTime - startTime).total_seconds()), "秒")
# if __name__ == '__main__':
# lyricSpider()