MythTV master
lyricsScraper.py
Go to the documentation of this file.
1#-*- coding: UTF-8 -*-
2'''
3Scraper for https://www.musixmatch.com/
4
5musixmatch
6'''
7
8import os
9import requests
10import re
11import random
12import difflib
13import html
14from bs4 import BeautifulSoup
15from lib.utils import *
16
__title__ = "musixmatch"  # log prefix and the value stored in Lyrics.source
__priority__ = '210'  # NOTE(review): presumably ranks this scraper; not read in this file
__lrc__ = False  # copied to Lyrics.lrc by get_lyrics

# HTTP headers sent with every request; identifies as a desktop Firefox.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
}
23
# Searching is not possible because it requires JavaScript; only direct access to a lyrics page works.
25
def __init__(self, *args, **kwargs):
    """Store the caller's options and the musixmatch URL templates.

    The keyword arguments ``debug`` and ``settings`` are required; a
    missing one raises ``KeyError``. Positional arguments are accepted
    but not used.
    """
    debug_flag = kwargs['debug']
    scraper_settings = kwargs['settings']
    self.DEBUG = debug_flag
    self.settings = scraper_settings
    # SEARCH_URL takes an appended query string; LYRIC_URL is filled with
    # (artist, title) via %-formatting.
    self.SEARCH_URL = 'https://www.musixmatch.com/search?query='
    self.LYRIC_URL = 'https://www.musixmatch.com/lyrics/%s/%s'
32
def get_lyrics(self, song):
    """Fetch the lyrics for *song* directly from its musixmatch lyrics page.

    The page URL is derived from the artist and title (punctuation removed,
    spaces turned into dashes) and the lyrics are cut out of the returned
    HTML between the "Lyrics of " and "Writer(s): " markers.

    Args:
        song: object exposing ``artist`` and ``title`` string attributes.

    Returns:
        A ``Lyrics`` object with ``song``, ``source``, ``lrc`` and
        ``lyrics`` filled in, or ``None`` when the request fails or the
        page does not contain the expected markers.
    """
    log("%s: searching lyrics for %s - %s" % (__title__, song.artist, song.title), debug=self.DEBUG)
    lyrics = Lyrics(settings=self.settings)
    lyrics.song = song
    lyrics.source = __title__
    lyrics.lrc = __lrc__
    # Single pass per string: delete the punctuation the URL path cannot
    # carry and map spaces to dashes.
    slug_table = str.maketrans(' ', '-', '\'!?"/.&,()')
    artist = song.artist.translate(slug_table)
    title = song.title.translate(slug_table)
    url = self.LYRIC_URL % (artist, title)
    try:
        log('%s: search url: %s' % (__title__, url), debug=self.DEBUG)
        search = requests.get(url, headers=headers, timeout=10)
        response = search.text
    except Exception:
        # Best effort: any request failure means "no lyrics from this
        # source", but KeyboardInterrupt/SystemExit still propagate.
        return None
    # Raw string so the escaped parentheses reach the regex engine intact.
    matchcode = re.search(r'Lyrics of (.*?)Writer\(s\): ', response, flags=re.DOTALL)
    if not matchcode:
        return None
    # Every HTML tag becomes a line break, entities are decoded, and runs
    # of four newlines are collapsed to one.
    lyr = re.sub('<[^<]+?>', '\n', matchcode.group(1))
    lyr = html.unescape(lyr)
    lyrics.lyrics = lyr.replace('\n\n\n\n', '\n')
    return lyrics
56
57'''
58 def get_lyrics(self, song):
59 log("%s: searching lyrics for %s - %s" % (__title__, song.artist, song.title), debug=self.DEBUG)
60 lyrics = Lyrics(settings=self.settings)
61 lyrics.song = song
62 lyrics.source = __title__
63 lyrics.lrc = __lrc__
64 artist = song.artist.replace(' ', '+')
65 title = song.title.replace(' ', '+')
66 search = '%s+%s' % (artist, title)
67 try:
68 url = self.SEARCH_URL + search
69 response = requests.get(url, headers=headers, timeout=10)
70 result = response.text
71 except:
72 return None
73 links = []
74 soup = BeautifulSoup(result, 'html.parser')
75 for item in soup.find_all('li', {'class': 'showArtist'}):
76 artistname = item.find('a', {'class': 'artist'}).get_text()
77 songtitle = item.find('a', {'class': 'title'}).get_text()
78 url = item.find('a', {'class': 'title'}).get('href')
79 if (difflib.SequenceMatcher(None, artist.lower(), artistname.lower()).ratio() > 0.8) and (difflib.SequenceMatcher(None, title.lower(), songtitle.lower()).ratio() > 0.8):
80 links.append((artistname + ' - ' + songtitle, self.LYRIC_URL + url, artistname, songtitle))
81 if len(links) == 0:
82 return None
83 elif len(links) > 1:
84 lyrics.list = links
85 for link in links:
86 lyr = self.get_lyrics_from_list(link)
87 if lyr:
88 lyrics.lyrics = lyr
89 return lyrics
90 return None
91
92 def get_lyrics_from_list(self, link):
93 title,url,artist,song = link
94 try:
95 log('%s: search url: %s' % (__title__, url), debug=self.DEBUG)
96 response = requests.get(url, headers=headers, timeout=10)
97 result = response.text
98 except:
99 return None
100 soup = BeautifulSoup(result, 'html.parser')
101 lyr = soup.find_all('span', {'class': 'lyrics__content__ok'})
102 if lyr:
103 lyrics = ''
104 for part in lyr:
105 lyrics = lyrics + part.get_text() + '\n'
106 return lyrics
107 else:
108 lyr = soup.find_all('span', {'class': 'lyrics__content__error'})
109 if lyr:
110 lyrics = ''
111 for part in lyr:
112 lyrics = lyrics + part.get_text() + '\n'
113 return lyrics
114'''
None log(str msg, int level=LOGDEBUG)
Definition: xbmc.py:9