3 Scraper for http://www.darklyrics.com/ - the largest metal lyrics archive on the Web.
10 from urllib.request
import Request, urlopen
11 from urllib.parse
import quote
16 from optparse
import OptionParser
17 from common
import utilities
19 __author__ =
"Paul Harrison and smory'"
20 __title__ =
"DarkLyrics"
21 __description__ =
"Search http://www.darklyrics.com/ - the largest metal lyrics archive on the Web"
24 __syncronized__ =
False
32 self.
searchUrl =
"http://www.darklyrics.com/search?q=%term%"
37 lastvisitts = str(int(math.ceil(time.time() * 1000 / (60 * 60 * 6 * 1000))))
39 for i
in range(len(lastvisitts)):
40 lastvisittscookie = ((lastvisittscookie << 5) - lastvisittscookie) + ord(lastvisitts[i])
41 lastvisittscookie = lastvisittscookie & lastvisittscookie
42 return str(lastvisittscookie)
45 term = quote((artist
if artist
else "") +
" " + (title
if title
else ""))
48 headers = {
'user-agent':
'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
49 cookies={
'lastvisitts': self.
cookie}
51 request.add_header(
'User-Agent', headers[
'user-agent'])
52 request.add_header(
"Cookie",
"lastvisitts=%s"% self.
cookie)
53 content = urlopen(request, timeout=10)
54 searchResponse = content.read()
58 searchResult = re.findall(rb
"<h2><a\shref=\"(.*?#([0-9]+))\".*?>(.*?)</a></h2>", searchResponse)
60 if len(searchResult) == 0:
66 for result
in searchResult:
80 request = urlopen(url)
85 pattern = rb
"<a\sname=\"%index%\">(.*?)(?:<h3>|<div)"
86 pattern = pattern.replace(b
"%index%", index)
88 match = re.search(pattern, res, re.MULTILINE | re.DOTALL)
91 s = s.replace(b
"<br />", b
"")
92 s = s.replace(b
"<i>", b
"")
93 s = s.replace(b
"</i>", b
"")
94 s = s.replace(b
"</a>", b
"")
95 s = s.replace(b
"</h3>", b
"")
102 request = urlopen(url)
107 match = re.search(rb
"<h2>(?:album|single|ep|live):?\s?(.*?)</h2>", res, re.IGNORECASE)
110 ret = (b
"(" + match.group(1) + b
")").replace(b
"\"", b
"")
117 utilities.log(debug,
"%s: searching lyrics for %s - %s - %s" % (__title__, lyrics.artist, lyrics.album, lyrics.title))
118 links = self.
search(lyrics.artist, lyrics.title)
120 if(links ==
None or len(links) == 0):
129 enc = chardet.detect(lyr)
130 lyr = lyr.decode(enc[
'encoding'],
'ignore')
135 title, url, artist, song, index = link
140 lyrics = utilities.Lyrics()
141 lyrics.source = __title__
142 lyrics.syncronized = __syncronized__
143 lyrics.artist =
'Dagon'
144 lyrics.album =
'Terraphobic'
145 lyrics.title =
'Cut To The Heart'
148 found = fetcher.get_lyrics(lyrics)
151 utilities.log(
True,
"Everything appears in order.")
155 utilities.log(
True,
"The lyrics for the test search failed!")
159 from lxml
import etree
160 xml = etree.XML(
u'<lyrics></lyrics>')
161 etree.SubElement(xml,
"artist").text = lyrics.artist
162 etree.SubElement(xml,
"album").text = lyrics.album
163 etree.SubElement(xml,
"title").text = lyrics.title
164 etree.SubElement(xml,
"syncronized").text =
'True' if __syncronized__
else 'False'
165 etree.SubElement(xml,
"grabber").text = lyrics.source
167 lines = lyrics.lyrics.splitlines()
169 line2 = re.sub(
u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+',
'', line)
170 etree.SubElement(xml,
"lyric").text = line2
172 utilities.log(
True, utilities.convert_etree(etree.tostring(xml, encoding=
'UTF-8',
173 pretty_print=
True, xml_declaration=
True)))
177 from lxml
import etree
178 version = etree.XML(
u'<grabber></grabber>')
179 etree.SubElement(version,
"name").text = __title__
180 etree.SubElement(version,
"author").text = __author__
181 etree.SubElement(version,
"command").text =
'darklyrics.py'
182 etree.SubElement(version,
"type").text =
'lyrics'
183 etree.SubElement(version,
"description").text = __description__
184 etree.SubElement(version,
"version").text = __version__
185 etree.SubElement(version,
"priority").text = __priority__
186 etree.SubElement(version,
"syncronized").text =
'True' if __syncronized__
else 'False'
188 utilities.log(
True, utilities.convert_etree(etree.tostring(version, encoding=
'UTF-8',
189 pretty_print=
True, xml_declaration=
True)))
195 parser = OptionParser()
197 parser.add_option(
'-v',
"--version", action=
"store_true", default=
False,
198 dest=
"version", help=
"Display version and author")
199 parser.add_option(
'-t',
"--test", action=
"store_true", default=
False,
200 dest=
"test", help=
"Test grabber with a know good search")
201 parser.add_option(
'-s',
"--search", action=
"store_true", default=
False,
202 dest=
"search", help=
"Search for lyrics.")
203 parser.add_option(
'-a',
"--artist", metavar=
"ARTIST", default=
None,
204 dest=
"artist", help=
"Artist of track.")
205 parser.add_option(
'-b',
"--album", metavar=
"ALBUM", default=
None,
206 dest=
"album", help=
"Album of track.")
207 parser.add_option(
'-n',
"--title", metavar=
"TITLE", default=
None,
208 dest=
"title", help=
"Title of track.")
209 parser.add_option(
'-f',
"--filename", metavar=
"FILENAME", default=
None,
210 dest=
"filename", help=
"Filename of track.")
211 parser.add_option(
'-d',
'--debug', action=
"store_true", default=
False,
212 dest=
"debug", help=(
"Show debug messages"))
213 opts, args = parser.parse_args()
215 lyrics = utilities.Lyrics()
216 lyrics.source = __title__
217 lyrics.syncronized = __syncronized__
229 lyrics.artist = opts.artist
231 lyrics.album = opts.album
233 lyrics.title = opts.title
235 lyrics.filename = opts.filename
238 if fetcher.get_lyrics(lyrics):
242 utilities.log(
True,
"No lyrics found for this track")
245 if __name__ ==
'__main__':