MythTV master
lyricsScraper.py
Go to the documentation of this file.
1#-*- coding: UTF-8 -*-
2'''
3Scraper for https://www.lyricsify.com/
4'''
5
6import requests
7import re
8import difflib
9from bs4 import BeautifulSoup
10from lib.utils import *
11
# Scraper metadata: display name, priority string and whether the lyrics are
# synchronised (LRC); get_lyrics copies __title__ and __lrc__ onto its result.
__title__ = "Lyricsify"
__priority__ = '130'
__lrc__ = True

# Browser-like request headers sent with every request to www.lyricsify.com.
# NOTE(review): "br" and "zstd" are advertised in Accept-Encoding; requests /
# urllib3 presumably decode those only when the optional brotli / zstandard
# packages are installed - confirm, otherwise a compressed reply would reach
# the lyrics regex undecoded.
UserAgent = {
    "Host": "www.lyricsify.com",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "DNT": "1",
    "Alt-Used": "www.lyricsify.com",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Priority": "u=1",
}
17
# Lyricsify protects its search page with CAPTCHAs and Cloudflare, so only direct access to a lyrics page works.
19
def __init__(self, *args, **kwargs):
    """Store the caller's options and the Lyricsify URL templates.

    Positional arguments are accepted but not used.

    Keyword Args:
        debug: kept as ``self.DEBUG``; get_lyrics hands it to ``log``.
        settings: kept as ``self.settings``; get_lyrics hands it to ``Lyrics``.

    Raises:
        KeyError: if ``debug`` or ``settings`` is not supplied.
    """
    self.DEBUG = kwargs['debug']
    self.settings = kwargs['settings']
    # Direct lyrics-page template; get_lyrics fills in (artist, title).
    self.SEARCH_URL = 'https://www.lyricsify.com/lyrics/%s/%s'
    # Template for site-relative links; in this file it is referenced only by
    # the disabled search-based implementation kept in the string literal below.
    self.LYRIC_URL = 'https://www.lyricsify.com%s'
26
def get_lyrics(self, song):
    """Fetch lyrics for *song* directly from its Lyricsify page.

    The page address is built from the artist and title: the characters
    ' ! ? " / . & , ( ) are removed, spaces become hyphens, and the result
    is lower-cased before being substituted into ``self.SEARCH_URL``.

    Args:
        song: object providing ``artist`` and ``title`` strings.

    Returns:
        A ``Lyrics`` object with ``song``, ``source``, ``lrc`` and ``lyrics``
        set, or ``None`` when the request fails or the page contains no
        ``details">...</div`` section.
    """
    log("%s: searching lyrics for %s - %s" % (__title__, song.artist, song.title), debug=self.DEBUG)
    lyrics = Lyrics(settings=self.settings)
    lyrics.song = song
    lyrics.source = __title__
    lyrics.lrc = __lrc__
    # Single pass per string: delete the punctuation, map spaces to hyphens.
    slug_table = str.maketrans(' ', '-', '\'!?"/.&,()')
    artist = song.artist.translate(slug_table)
    title = song.title.translate(slug_table)
    url = self.SEARCH_URL % (artist.lower(), title.lower())
    try:
        log('%s: search url: %s' % (__title__, url), debug=self.DEBUG)
        search = requests.get(url, headers=UserAgent, timeout=10)
        response = search.text
    except Exception as exc:
        # Still best effort (any failure means "no lyrics"), but unlike a bare
        # except this lets KeyboardInterrupt / SystemExit propagate and
        # records why the lookup failed.
        log('%s: request failed: %s' % (__title__, exc), debug=self.DEBUG)
        return None
    # The lyrics sit between the element whose attribute ends in details"
    # and the next closing div.
    matchcode = re.search(r'details">(.*?)</div', response, flags=re.DOTALL)
    if not matchcode:
        return None
    # Drop any remaining HTML tags from the captured section.
    lyrics.lyrics = re.sub(r'<[^<]+?>', '', matchcode.group(1))
    return lyrics
49
50'''
51 def get_lyrics(self, song):
52 log("%s: searching lyrics for %s - %s" % (__title__, song.artist, song.title), debug=self.DEBUG)
53 lyrics = Lyrics(settings=self.settings)
54 lyrics.song = song
55 lyrics.source = __title__
56 lyrics.lrc = __lrc__
57 artist = song.artist.replace(' ', '-')
58 title = song.title.replace(' ', '-')
59 try:
60 url = self.SEARCH_URL % (artist, title)
61 search = requests.get(url, headers=UserAgent, timeout=10)
62 response = search.text
63 except:
64 return None
65 links = []
66 soup = BeautifulSoup(response, 'html.parser')
67 for link in soup.find_all('a'):
68 if link.string and link.get('href').startswith('/lrc/'):
69 foundartist = link.string.split(' - ', 1)[0]
70 # some links don't have a proper 'artist - title' format
71 try:
72 foundsong = link.string.split(' - ', 1)[1].rstrip('.lrc')
73 except:
74 continue
75 if (difflib.SequenceMatcher(None, artist.lower(), foundartist.lower()).ratio() > 0.8) and (difflib.SequenceMatcher(None, title.lower(), foundsong.lower()).ratio() > 0.8):
76 links.append((foundartist + ' - ' + foundsong, self.LYRIC_URL % link.get('href'), foundartist, foundsong))
77 if len(links) == 0:
78 return None
79 elif len(links) > 1:
80 lyrics.list = links
81 for link in links:
82 lyr = self.get_lyrics_from_list(link)
83 if lyr:
84 lyrics.lyrics = lyr
85 return lyrics
86 return None
87
88 def get_lyrics_from_list(self, link):
89 title,url,artist,song = link
90 try:
91 log('%s: search url: %s' % (__title__, url), debug=self.DEBUG)
92 search = requests.get(url, headers=UserAgent, timeout=10)
93 response = search.text
94 except:
95 return None
96 matchcode = re.search('/h3>(.*?)</div', response, flags=re.DOTALL)
97 if matchcode:
98 lyricscode = (matchcode.group(1))
99 cleanlyrics = re.sub('<[^<]+?>', '', lyricscode)
100 return cleanlyrics
101'''
def get_lyrics_from_list(self, link)
def __init__(self, *args, **kwargs)
def get_lyrics(self, song)
None log(str msg, int level=LOGDEBUG)
Definition: xbmc.py:9