3 Scraper for https://www.musixmatch.com/
14 from bs4
import BeautifulSoup
17 __title__ =
"musixmatch"
22 headers[
'User-Agent'] =
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
30 self.
SEARCH_URL =
'https://www.musixmatch.com/search?query='
31 self.
LYRIC_URL =
'https://www.musixmatch.com/lyrics/%s/%s'
34 log(
"%s: searching lyrics for %s - %s" % (__title__, song.artist, song.title), debug=self.
DEBUG)
35 lyrics = Lyrics(settings=self.
settings)
37 lyrics.source = __title__
39 artist = song.artist.replace(
"'",
'').replace(
'!',
'').replace(
'?',
'').replace(
'"',
'').replace(
'/',
'').replace(
'.',
'').replace(
'&',
'').replace(
',',
'').replace(
'(',
'').replace(
')',
'').replace(
' ',
'-')
40 title = song.title.replace(
"'",
'').replace(
'!',
'').replace(
'?',
'').replace(
'"',
'').replace(
'/',
'').replace(
'.',
'').replace(
'&',
'').replace(
',',
'').replace(
'(',
'').replace(
')',
'').replace(
' ',
'-')
43 log(
'%s: search url: %s' % (__title__, url), debug=self.
DEBUG)
44 search = requests.get(url, headers=headers, timeout=10)
45 response = search.text
48 matchcode = re.search(
'Lyrics of (.*?)Writer\(s\): ', response, flags=re.DOTALL)
50 lyricscode = (matchcode.group(1))
51 lyr = re.sub(
'<[^<]+?>',
'\n', lyricscode)
52 lyr = html.unescape(lyr)
53 lyrics.lyrics = lyr.replace(
'\n\n\n\n',
'\n')
58 def get_lyrics(self, song):
59 log("%s: searching lyrics for %s - %s" % (__title__, song.artist, song.title), debug=self.DEBUG)
60 lyrics = Lyrics(settings=self.settings)
62 lyrics.source = __title__
64 artist = song.artist.replace(' ', '+')
65 title = song.title.replace(' ', '+')
66 search = '%s+%s' % (artist, title)
68 url = self.SEARCH_URL + search
69 response = requests.get(url, headers=headers, timeout=10)
70 result = response.text
74 soup = BeautifulSoup(result, 'html.parser')
75 for item in soup.find_all('li', {'class': 'showArtist'}):
76 artistname = item.find('a', {'class': 'artist'}).get_text()
77 songtitle = item.find('a', {'class': 'title'}).get_text()
78 url = item.find('a', {'class': 'title'}).get('href')
79 if (difflib.SequenceMatcher(None, artist.lower(), artistname.lower()).ratio() > 0.8) and (difflib.SequenceMatcher(None, title.lower(), songtitle.lower()).ratio() > 0.8):
80 links.append((artistname + ' - ' + songtitle, self.LYRIC_URL + url, artistname, songtitle))
86 lyr = self.get_lyrics_from_list(link)
92 def get_lyrics_from_list(self, link):
93 title,url,artist,song = link
95 log('%s: search url: %s' % (__title__, url), debug=self.DEBUG)
96 response = requests.get(url, headers=headers, timeout=10)
97 result = response.text
100 soup = BeautifulSoup(result, 'html.parser')
101 lyr = soup.find_all('span', {'class': 'lyrics__content__ok'})
105 lyrics = lyrics + part.get_text() + '\n'
108 lyr = soup.find_all('span', {'class': 'lyrics__content__error'})
112 lyrics = lyrics + part.get_text() + '\n'