3 Scraper for http://www.darklyrics.com/ - the largest metal lyrics archive on the Web.
10 from urllib2
import quote, urlopen
12 from urllib.request
import urlopen
13 from urllib.parse
import quote
17 from optparse
import OptionParser
18 from common
import utilities
# Grabber metadata. These values are written into the <grabber> description
# XML (name, author, description, syncronized) and stamped onto Lyrics
# objects (source, syncronized) elsewhere in this file.
# NOTE: the spelling "syncronized" is kept as-is because other code in this
# file refers to the constant and to the XML element by that exact name.
__author__ = "Paul Harrison and smory'"
__title__ = "DarkLyrics"
__description__ = "Search http://www.darklyrics.com/ - the largest metal lyrics archive on the Web"
__syncronized__ = False
33 self.
searchUrl =
"http://www.darklyrics.com/search?q=%term%"
36 term = quote((artist
if artist
else "") +
" " + (title
if title
else ""))
39 request = urlopen(self.
searchUrl.replace(
"%term%", term))
40 searchResponse = request.read()
44 searchResult = re.findall(b
"<h2><a\shref=\"(.*?#([0-9]+))\".*?>(.*?)</a></h2>", searchResponse)
46 if len(searchResult) == 0:
52 for result
in searchResult:
66 request = urlopen(url)
71 pattern = b
"<a\sname=\"%index%\">(.*?)(?:<h3>|<div)"
72 pattern = pattern.replace(b
"%index%", index)
74 match = re.search(pattern, res, re.MULTILINE | re.DOTALL)
77 s = s.replace(b
"<br />", b
"")
78 s = s.replace(b
"<i>", b
"")
79 s = s.replace(b
"</i>", b
"")
80 s = s.replace(b
"</a>", b
"")
81 s = s.replace(b
"</h3>", b
"")
88 request = urlopen(url)
93 match = re.search(b
"<h2>(?:album|single|ep|live):?\s?(.*?)</h2>", res, re.IGNORECASE)
96 ret = (b
"(" + match.group(1) + b
")").replace(b
"\"", b
"")
103 utilities.log(debug,
"%s: searching lyrics for %s - %s - %s" % (__title__, lyrics.artist, lyrics.album, lyrics.title))
104 links = self.
search(lyrics.artist, lyrics.title)
106 if(links ==
None or len(links) == 0):
115 enc = chardet.detect(lyr)
116 lyr = lyr.decode(enc[
'encoding'],
'ignore')
121 title, url, artist, song, index = link
126 lyrics = utilities.Lyrics()
127 lyrics.source = __title__
128 lyrics.syncronized = __syncronized__
129 lyrics.artist =
'Dagon'
130 lyrics.album =
'Terraphobic'
131 lyrics.title =
'Cut To The Heart'
134 found = fetcher.get_lyrics(lyrics)
137 utilities.log(
True,
"Everything appears in order.")
141 utilities.log(
True,
"The lyrics for the test search failed!")
145 from lxml
import etree
146 xml = etree.XML(
u'<lyrics></lyrics>')
147 etree.SubElement(xml,
"artist").text = lyrics.artist
148 etree.SubElement(xml,
"album").text = lyrics.album
149 etree.SubElement(xml,
"title").text = lyrics.title
150 etree.SubElement(xml,
"syncronized").text =
'True' if __syncronized__
else 'False'
151 etree.SubElement(xml,
"grabber").text = lyrics.source
153 lines = lyrics.lyrics.splitlines()
155 line2 = re.sub(
u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+',
'', line)
156 etree.SubElement(xml,
"lyric").text = line2
158 utilities.log(
True, utilities.convert_etree(etree.tostring(xml, encoding=
'UTF-8',
159 pretty_print=
True, xml_declaration=
True)))
# Describe this grabber as a <grabber> XML document and emit it through
# utilities.log. lxml is imported here, at the point of use, exactly as in
# the original code.
from lxml import etree
version = etree.XML(u'<grabber></grabber>')

# One child element per metadata field; every value comes from the
# module-level metadata constants.
etree.SubElement(version, "name").text = __title__
etree.SubElement(version, "author").text = __author__
etree.SubElement(version, "command").text = 'darklyrics.py'
etree.SubElement(version, "type").text = 'lyrics'
etree.SubElement(version, "description").text = __description__
etree.SubElement(version, "version").text = __version__
etree.SubElement(version, "priority").text = __priority__
# The boolean flag is serialised as the literal text 'True' / 'False'.
etree.SubElement(version, "syncronized").text = 'True' if __syncronized__ else 'False'

# Serialise as pretty-printed UTF-8 with an XML declaration, pass the result
# through utilities.convert_etree, and log it unconditionally (first
# argument True).
utilities.log(True, utilities.convert_etree(etree.tostring(version, encoding='UTF-8',
                                                           pretty_print=True, xml_declaration=True)))
# Command-line interface of the grabber.
# -v / -t / -s / -d are boolean switches (store_true, default False);
# -a / -b / -n / -f take a value and default to None when not supplied.
parser = OptionParser()

parser.add_option('-v', "--version", action="store_true", default=False,
                  dest="version", help="Display version and author")
parser.add_option('-t', "--test", action="store_true", default=False,
                  dest="test", help="Test grabber with a know good search")
parser.add_option('-s', "--search", action="store_true", default=False,
                  dest="search", help="Search for lyrics.")
parser.add_option('-a', "--artist", metavar="ARTIST", default=None,
                  dest="artist", help="Artist of track.")
parser.add_option('-b', "--album", metavar="ALBUM", default=None,
                  dest="album", help="Album of track.")
parser.add_option('-n', "--title", metavar="TITLE", default=None,
                  dest="title", help="Title of track.")
parser.add_option('-f', "--filename", metavar="FILENAME", default=None,
                  dest="filename", help="Filename of track.")
parser.add_option('-d', '--debug', action="store_true", default=False,
                  dest="debug", help=("Show debug messages"))
199 opts, args = parser.parse_args()
201 lyrics = utilities.Lyrics()
202 lyrics.source = __title__
203 lyrics.syncronized = __syncronized__
215 lyrics.artist = opts.artist
217 lyrics.album = opts.album
219 lyrics.title = opts.title
221 lyrics.filename = opts.filename
224 if fetcher.get_lyrics(lyrics):
228 utilities.log(
True,
"No lyrics found for this track")
231 if __name__ ==
'__main__':