# -*- coding:utf-8 -*-
import requests
import json
import re
from bs4 import BeautifulSoup
import re
import os
from urllib import parse
support_format = ["mp3", "wav", 'ape']  # audio extensions eligible for lyric crawling
songDict = {}      # {song name: original file name}
lrc_list = []      # basenames of lyric files that already exist on disk
singerDict = {}    # {singer: [song names still needing lyrics]}
songUrlMap = {}    # {singer: {song name: lyric page url}}
page_list = []
g_find_num = 0     # running count of lyric files written


def get_song_info(song_dir, lrc_dir):
    '''
    Scan song_dir for music files named "singer - title.ext" and build the
    lookup tables used by the crawler.

    Args:
        song_dir: directory containing the music files.
        lrc_dir:  directory containing already-downloaded .lrc files;
                  songs whose basename appears here are skipped.
    Returns:
        (songDict, singerDict) where songDict maps song title -> file name
        and singerDict maps singer -> list of titles that still need lyrics.
    Side effects:
        Appends existing lyric basenames to the global lrc_list and creates
        an empty per-singer entry in the global songUrlMap.
    '''
    song_files = os.listdir(song_dir)
    lrc_files = os.listdir(lrc_dir)
    # local tables shadow the module-level ones; returned to the caller
    songDict = {}
    singerDict = {}
    for lrc_file in lrc_files:
        # rsplit keeps dots inside the title intact ("a.b.lrc" -> "a.b")
        lrc_list.append(lrc_file.rsplit('.', 1)[0])
    print(lrc_list)
    for song_file in song_files:
        # split from the right so dots in the title don't break parsing
        base, _, songFormat = song_file.rpartition('.')
        if not base or '-' not in base:
            # does not match the "singer - title.ext" convention; skip it
            continue
        singer, _, songName = base.partition('-')
        singer = singer.strip()
        songName = songName.strip()
        songFormat = songFormat.strip().lower()
        songDict[songName] = song_file.strip()
        # only crawl when no lyric file exists for this song yet
        if base not in lrc_list:
            songUrlMap.setdefault(singer, {})
            # BUG FIX: the original 'else' branch replaced the whole list
            # whenever the format was unsupported, wiping songs already
            # collected for that singer. Only supported formats are queued.
            if songFormat in support_format:
                singerDict.setdefault(singer, []).append(songName)
    return songDict, singerDict
def get_singer_url(singer, page_num=1):
    '''
    Build the search-result URL for one page of a singer's songs on
    90lrc.cn and delegate to get_song_url() to harvest the lyric links.

    Args:
        singer:   artist name (URL-quoted into the query string).
        page_num: 1-based results page index; the caller increments it and
                  calls again until this returns False.
    Returns:
        get_song_url()'s result: False when the site reports no records
        for this page, True otherwise.
    '''
    singer_url = ("https://www.90lrc.cn/so.php?wd="
                  + parse.quote(singer) + "&page=" + str(page_num))
    print(singer_url)
    # BUG FIX: the original also fetched and parsed the page here, then
    # threw the result away — get_song_url() downloads the same URL, so
    # every search page was requested twice.
    return get_song_url(singer, singer_url)
def get_song_url(singer, singer_url):
    '''
    Fetch one search-result page and record lyric-page URLs for `singer`.

    For every list entry whose title is one of the songs we still need
    (per the global singerDict), store its absolute lyric URL in the
    global songUrlMap[singer], skipping titles already recorded.

    Args:
        singer:     artist name; must be a key of singerDict and songUrlMap.
        singer_url: full search-result URL for one page.
    Returns:
        False as soon as the site shows "暂无记录" (no records) so the
        caller stops paging; True otherwise.
    '''
    print("get_song_url(" + singer + "," + singer_url + ")")
    lyric = requests.get(singer_url)
    lyric.encoding = 'utf-8'
    soup = BeautifulSoup(lyric.text, 'lxml')
    # each 'ss' element holds a <li> list of search hits
    for each in soup.find_all(attrs={'class': 'ss'}):
        for each_li in each.find_all('li'):
            if str(each_li.string) == "暂无记录":
                return False
            # only rows that start with the "lyric name" label carry links
            if str(each_li).startswith("<li>歌词名称"):
                ss_songUrl = each_li.a['href']   # relative lyric-page path
                ss_songName = each_li.a.string   # lyric title text
                lrc_url = "https://www.90lrc.cn" + str(ss_songUrl)
                if ss_songName in singerDict[singer]:
                    if ss_songName not in songUrlMap[singer].keys():
                        songUrlMap[singer][ss_songName] = lrc_url
                        print("find " + ss_songName + " " + lrc_url)
    # NOTE(review): a page with no 'ss' elements at all also returns True,
    # which the caller treats as "keep paging" — confirm the site always
    # renders the 暂无记录 marker on the last page.
    return True
def lrc_crawler(lrc_url, lrc_dir):
    '''
    Download one lyric page and write its timestamped lines to a .lrc file.

    Args:
        lrc_url: absolute URL of the lyric page on 90lrc.cn.
        lrc_dir: directory in which the .lrc file is created.
    Side effects:
        Creates <lrc_dir>/<song basename>.lrc and increments the global
        g_find_num counter on success.
    '''
    global g_find_num  # BUG FIX: original incremented a local copy only
    lyric = requests.get(lrc_url)
    if not lyric.ok:
        return
    soup = BeautifulSoup(lyric.text, 'lxml')
    text = re.split(r'[\r\n]', str(soup.text))
    if "出错了" in text:
        # site error page — nothing to extract
        return
    web_title = soup.title  # first <title> tag only
    if not web_title:
        return
    # the page title looks like "<song name>Mp3..."; keep the name part
    web_songName = web_title.string.split("Mp3")[0].strip()
    if web_songName not in songDict:
        return
    print("songDict[" + web_songName + "] : " + songDict[web_songName])
    # BUG FIX: the lrc_dir parameter was ignored (hard-coded './lrc/'),
    # the file name was lowercased (so get_song_info never detected the
    # lyric on the next run), and only ".mp3" was replaced, giving wrong
    # names for .wav/.ape files. Swap the real extension, keep the case.
    base = os.path.splitext(str(songDict[web_songName]))[0]
    lrc_fileName = os.path.join(lrc_dir, base + ".lrc")
    print("lrc_fileName : " + lrc_fileName)
    # 'with' guarantees the file is closed even if a write fails
    with open(lrc_fileName, 'w', encoding='utf-8') as lrc_file:
        for line in text:
            # keep only timestamped lyric lines such as "[00:12.34]..."
            if line.startswith("[0"):
                lrc_file.write(line + "\n")
    g_find_num += 1
    print(web_songName + " ==============> " + str(g_find_num) + " " + lrc_fileName)
if __name__ == "__main__":
    # Usage:
    #   1) point songDir at your music folder and lrcDir at the output
    #      directory for generated .lrc files
    #   2) music files must be named "singer - title.mp3"
    songDir = "K:/个人学习/全部音乐/"
    lrcDir = "./lrc/"
    # rebinds the module-level tables so the crawler functions see them
    songDict, singerDict = get_song_info(songDir, lrcDir)
    for singer in singerDict:
        # walk the search-result pages until the site reports no records
        page_num = 1
        while get_singer_url(singer, page_num):
            page_num += 1
        print(songUrlMap[singer])
        for title, lyric_url in songUrlMap[singer].items():
            lrc_crawler(lyric_url, lrcDir)
# Source: CSDN
# Author: heima9000
# Link: https://blog.csdn.net/heima9000/article/details/104064223