3 Scraper for http://www.darklyrics.com/ - the largest metal lyrics archive on the Web.
17 from ctypes
import c_int32
22 from optparse
import OptionParser
23 from common
import utilities
25 __author__ =
"Paul Harrison and smory"
26 __title__ =
"DarkLyrics"
27 __description__ =
"Search http://www.darklyrics.com/ - the largest metal lyrics archive on the Web"
30 __syncronized__ =
False
38 self.
searchUrl =
"http://www.darklyrics.com/search?q=%s"
42 lastvisitts =
'Nergal' + str(math.ceil(time.time() * 1000 / (60 * 60 * 6 * 1000)))
45 while i < len(lastvisitts):
47 lastvisittscookie = c_int32((c_int32(lastvisittscookie<<5).value - c_int32(lastvisittscookie).value) + ord(lastvisitts[i])).value
51 lastvisittscookie = lastvisittscookie & lastvisittscookie
52 return str(lastvisittscookie)
55 term = urllib.parse.quote((artist
if artist
else '') +
'+' + (title
if title
else ''))
57 headers = {
'user-agent':
'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
58 req = requests.get(self.
searchUrl % term, headers=headers, cookies={
'lastvisitts': self.
cookie}, timeout=10)
59 searchResponse = req.text
63 searchResult = re.findall(
'<h2><a\shref="(.*?#([0-9]+))".*?>(.*?)</a></h2>', searchResponse)
64 if len(searchResult) == 0:
68 for result
in searchResult:
81 headers = {
'user-agent':
'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
82 req = requests.get(url, headers=headers, cookies={
'lastvisitts': self.
cookie}, timeout=10)
86 pattern =
'<a\sname="%index%">(.*?)(?:<h3>|<div)'
87 pattern = pattern.replace(
'%index%', index)
88 match = re.search(pattern, res, re.MULTILINE | re.DOTALL)
91 s = s.replace(
'<br />',
'')
92 s = s.replace(
'<i>',
'')
93 s = s.replace(
'</i>',
'')
94 s = s.replace(
'</a>',
'')
95 s = s.replace(
'</h3>',
'')
102 headers = {
'user-agent':
'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
103 req = requests.get(url, headers=headers, cookies={
'lastvisitts': self.
cookie}, timeout=10)
107 match = re.search(
'<h2>(?:album|single|ep|live):?\s?(.*?)</h2>', res, re.IGNORECASE)
109 return (
'(' + match.group(1) +
')').replace(
'\'',
'')
114 utilities.log(debug,
"%s: searching lyrics for %s - %s - %s" % (__title__, lyrics.artist, lyrics.album, lyrics.title))
115 links = self.
search(lyrics.artist, lyrics.title)
116 if(links ==
None or len(links) == 0):
128 title, url, artist, song, index = link
133 lyrics = utilities.Lyrics()
134 lyrics.source = __title__
135 lyrics.syncronized = __syncronized__
136 lyrics.artist =
'Neurosis'
138 lyrics.title =
'Lost'
141 found = fetcher.get_lyrics(lyrics)
144 utilities.log(
True,
"Everything appears in order.")
148 utilities.log(
True,
"The lyrics for the test search failed!")
152 from lxml
import etree
153 xml = etree.XML(
u'<lyrics></lyrics>')
154 etree.SubElement(xml,
"artist").text = lyrics.artist
155 etree.SubElement(xml,
"album").text = lyrics.album
156 etree.SubElement(xml,
"title").text = lyrics.title
157 etree.SubElement(xml,
"syncronized").text =
'True' if __syncronized__
else 'False'
158 etree.SubElement(xml,
"grabber").text = lyrics.source
160 lines = lyrics.lyrics.splitlines()
162 line2 = re.sub(
u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+',
'', line)
163 etree.SubElement(xml,
"lyric").text = line2
165 utilities.log(
True, utilities.convert_etree(etree.tostring(xml, encoding=
'UTF-8',
166 pretty_print=
True, xml_declaration=
True)))
170 from lxml
import etree
171 version = etree.XML(
u'<grabber></grabber>')
172 etree.SubElement(version,
"name").text = __title__
173 etree.SubElement(version,
"author").text = __author__
174 etree.SubElement(version,
"command").text =
'darklyrics.py'
175 etree.SubElement(version,
"type").text =
'lyrics'
176 etree.SubElement(version,
"description").text = __description__
177 etree.SubElement(version,
"version").text = __version__
178 etree.SubElement(version,
"priority").text = __priority__
179 etree.SubElement(version,
"syncronized").text =
'True' if __syncronized__
else 'False'
181 utilities.log(
True, utilities.convert_etree(etree.tostring(version, encoding=
'UTF-8',
182 pretty_print=
True, xml_declaration=
True)))
188 parser = OptionParser()
190 parser.add_option(
'-v',
"--version", action=
"store_true", default=
False,
191 dest=
"version", help=
"Display version and author")
192 parser.add_option(
'-t',
"--test", action=
"store_true", default=
False,
193 dest=
"test", help=
"Test grabber with a know good search")
194 parser.add_option(
'-s',
"--search", action=
"store_true", default=
False,
195 dest=
"search", help=
"Search for lyrics.")
196 parser.add_option(
'-a',
"--artist", metavar=
"ARTIST", default=
None,
197 dest=
"artist", help=
"Artist of track.")
198 parser.add_option(
'-b',
"--album", metavar=
"ALBUM", default=
None,
199 dest=
"album", help=
"Album of track.")
200 parser.add_option(
'-n',
"--title", metavar=
"TITLE", default=
None,
201 dest=
"title", help=
"Title of track.")
202 parser.add_option(
'-f',
"--filename", metavar=
"FILENAME", default=
None,
203 dest=
"filename", help=
"Filename of track.")
204 parser.add_option(
'-d',
'--debug', action=
"store_true", default=
False,
205 dest=
"debug", help=(
"Show debug messages"))
206 opts, args = parser.parse_args()
208 lyrics = utilities.Lyrics()
209 lyrics.source = __title__
210 lyrics.syncronized = __syncronized__
222 lyrics.artist = opts.artist
224 lyrics.album = opts.album
226 lyrics.title = opts.title
228 lyrics.filename = opts.filename
231 if fetcher.get_lyrics(lyrics):
235 utilities.log(
True,
"No lyrics found for this track")
238 if __name__ ==
'__main__':