MythTV master
lyricsScraper.py
Go to the documentation of this file.
1#-*- coding: UTF-8 -*-
2'''
3Scraper for http://www.darklyrics.com/ - the largest metal lyrics archive on the Web.
4
5scraper by smory
6'''
7
8import hashlib
9import math
10import requests
11import time
12import urllib.parse
13import re
14from lib.utils import *
15try:
16 from ctypes import c_int32 # ctypes not supported on xbox
17except:
18 pass
19
# Name of this scraper; get_lyrics() stores it as the lyrics source and
# prefixes its log message with it.
20__title__ = 'darklyrics'
# Not read anywhere in this file -- presumably used by the caller to order
# the available scrapers (TODO confirm).
21__priority__ = '260'
# Copied to Lyrics.lrc by get_lyrics(); presumably False means plain text
# rather than time-synchronised LRC lyrics (TODO confirm).
22__lrc__ = False
23
24
def __init__(self, *args, **kwargs):
    '''
    Prepare the scraper: site addresses, caller options and request cookie.

    Keyword arguments (both are required, a missing one raises KeyError):
        debug    -- kept as self.DEBUG and handed to log() by get_lyrics()
        settings -- kept as self.settings and handed to Lyrics() by get_lyrics()

    Positional arguments are accepted but not used.
    '''
    # Addresses of the site and of its search page (%s takes the query).
    self.base_url = 'http://www.darklyrics.com/'
    self.searchUrl = 'http://www.darklyrics.com/search?q=%s'

    # Options supplied by the caller.
    self.DEBUG = kwargs['debug']
    self.settings = kwargs['settings']

    # Computed once here; every request made by this object sends it as
    # the 'lastvisitts' cookie.
    self.cookie = self.getCookie()
32
def getCookie(self):
    '''
    Build the value of the 'lastvisitts' cookie sent with every request.

    Based on http://www.darklyrics.com/tban.js: the text 'Nergal' followed
    by the number of the current six-hour period is hashed with
    hash = (hash << 5) - hash + character code, truncated to a signed
    32-bit integer.

    The 32-bit wrap-around is done with plain integer arithmetic, so a
    cookie is also produced where ctypes (c_int32) is not available; in
    that case this method used to return None.

    Returns the hash as a decimal string (it may be negative).
    '''
    six_hours_ms = 60 * 60 * 6 * 1000
    seed = 'Nergal' + str(math.ceil(time.time() * 1000 / six_hours_ms))

    hashed = 0
    for char in seed:
        # Only the low 32 bits matter; masking at every step yields the
        # same bits as wrapping a signed 32-bit integer at every step.
        hashed = ((hashed << 5) - hashed + ord(char)) & 0xFFFFFFFF

    # Reinterpret the unsigned 32-bit pattern as a signed value.
    if hashed >= 0x80000000:
        hashed -= 0x100000000
    return str(hashed)
46
def search(self, artist, title):
    '''
    Query the darklyrics search page and list the songs it returns.

    artist, title -- search terms; None or '' counts as an empty string.

    Returns None when the request fails or nothing matches. Otherwise
    returns a list with one entry per hit, each entry being
        [display title, lyrics page url, artist, title, anchor id]
    where artist and title are the arguments passed in and the anchor id
    names the part of the page holding this song's lyrics. The display
    title of the first six hits has the album name appended.
    '''
    term = urllib.parse.quote((artist or '') + '+' + (title or ''))
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
    try:
        response = requests.get(self.searchUrl % term, headers=headers, cookies={'lastvisitts': self.cookie}, timeout=10)
        page = response.text
    except Exception:
        # Best effort: any failure to fetch the page means "no result".
        # Unlike a bare except this lets KeyboardInterrupt/SystemExit through.
        return None

    # Raw string so that \s reaches the regex engine without relying on an
    # invalid string escape.
    hits = re.findall(r'<h2><a\shref="(.*?#([0-9]+))".*?>(.*?)</a></h2>', page)
    if not hits:
        return None

    links = []
    for position, (href, anchor, heading) in enumerate(hits):
        page_url = self.base_url + href
        # getAlbumName() makes one more request per hit, so only the first
        # six hits get the album name appended.
        album = ' ' + self.getAlbumName(page_url) if position < 6 else ''
        links.append([
            heading + album,  # title from server + album name
            page_url,         # url with lyrics
            artist,
            title,
            anchor,           # id of the side part containing this song lyrics
        ])
    return links
70
def findLyrics(self, url, index):
    '''
    Download a lyrics page and cut out the text of one song.

    url   -- address of the page to download
    index -- value of the <a name="..."> anchor that starts the song (the
             anchor id produced by search()); it is converted to str and
             regex-escaped before being placed in the pattern

    Returns the text between that anchor and the next '<h3>' or '<div',
    with '<br />', '<i>', '</i>', '</a>' and '</h3>' removed. Returns None
    when the request fails or the anchor is not found.
    '''
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
    try:
        response = requests.get(url, headers=headers, cookies={'lastvisitts': self.cookie}, timeout=10)
        page = response.text
    except Exception:
        # Best effort: any failure to fetch the page means "no lyrics".
        return None

    # The song text spans several lines, hence DOTALL so '.' matches newlines.
    pattern = r'<a\sname="%s">(.*?)(?:<h3>|<div)' % re.escape(str(index))
    match = re.search(pattern, page, re.MULTILINE | re.DOTALL)
    if not match:
        return None

    lyrics = match.group(1)
    for tag in ('<br />', '<i>', '</i>', '</a>', '</h3>'):
        lyrics = lyrics.replace(tag, '')
    return lyrics
91
def getAlbumName(self, url):
    '''
    Download a lyrics page and read the release name from its heading.

    url -- address of the page to download

    Looks for an <h2> heading starting with 'album', 'single', 'ep' or
    'live' (any letter case, optional colon) and returns the rest of that
    heading wrapped in parentheses, with apostrophes removed. Returns ''
    when the request fails or no such heading exists.
    '''
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
    try:
        response = requests.get(url, headers=headers, cookies={'lastvisitts': self.cookie}, timeout=10)
        page = response.text
    except Exception:
        # Best effort: the album name is optional decoration.
        return ''

    # Raw string so that \s reaches the regex engine without relying on an
    # invalid string escape.
    match = re.search(r'<h2>(?:album|single|ep|live):?\s?(.*?)</h2>', page, re.IGNORECASE)
    if not match:
        return ''
    return ('(' + match.group(1) + ')').replace('\'', '')
104
def get_lyrics(self, song):
    '''
    Look up the lyrics for a song and wrap them in a Lyrics object.

    song -- object whose artist and title attributes are the search terms

    Returns None when the search gives no hit or the first hit yields no
    lyrics. Otherwise returns a Lyrics object whose song, source, lrc and
    lyrics attributes are filled in; when the search gave more than one
    hit, the full hit list is stored in its list attribute as well.
    '''
    log('%s: searching lyrics for %s - %s' % (__title__, song.artist, song.title), debug=self.DEBUG)

    result = Lyrics(settings=self.settings)
    result.song = song
    result.source = __title__
    result.lrc = __lrc__

    candidates = self.search(song.artist, song.title)
    if not candidates:
        return None
    if len(candidates) > 1:
        # Keep every hit on the result alongside the first one's lyrics.
        result.list = candidates

    text = self.get_lyrics_from_list(candidates[0])
    if not text:
        return None

    result.lyrics = text
    return result
121
def get_lyrics_from_list(self, link):
    '''
    Fetch the lyrics for one entry of the list built by search().

    link -- five-item sequence: display title, page url, artist, song
            title, anchor id; only the url and the anchor id are used

    Returns whatever findLyrics() returns for that url and anchor id.
    '''
    _display_title, page_url, _artist, _song, anchor = link
    return self.findLyrics(page_url, anchor)
None log(str msg, int level=LOGDEBUG)
Definition: xbmc.py:9