MythTV  master
darklyrics.py
Go to the documentation of this file.
1 #-*- coding: UTF-8 -*-
2 """
3 Scraper for http://www.darklyrics.com/ - the largest metal lyrics archive on the Web.
4 
5 scraper by smory
6 """
7 
8 import hashlib
9 import math
10 import requests
11 import urllib.parse
12 import re
13 import time
14 import chardet
15 
16 try:
17  from ctypes import c_int32 # ctypes not supported on xbox
18 except:
19  pass
20 
21 import sys
22 from optparse import OptionParser
23 from common import utilities
24 
# Grabber metadata. These values are emitted in the <grabber> XML built by
# buildVersion() and copied onto the Lyrics object before a search.
__author__ = "Paul Harrison and smory"
__title__ = "DarkLyrics"
__description__ = "Search http://www.darklyrics.com/ - the largest metal lyrics archive on the Web"
__priority__ = "260"
__version__ = "0.3"
# Spelling ("syncronized") is part of the emitted XML tag name and of the
# Lyrics attribute name used in this file, so it must stay as is.
__syncronized__ = False

# Passed as the first argument to utilities.log() for debug messages;
# main() sets it to True when -d/--debug is given.
debug = False
33 
35 
class LyricsFetcher:
    """Search http://www.darklyrics.com/ and extract song lyrics.

    NOTE(review): the ``class`` statement itself was not present in the
    listing this block was restored from; the class name comes from the
    ``LyricsFetcher()`` calls elsewhere in the file. Confirm that the
    original declares no base class.
    """

    # Browser-like user agent sent with every request.
    _HEADERS = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
    # Timeout, in seconds, passed to every HTTP request.
    _TIMEOUT = 10
    # Only the first few search hits get an extra request for the album name.
    _ALBUM_LOOKUP_LIMIT = 6

    def __init__( self ):
        """Set the site URLs and compute the cookie sent with each request."""
        self.base_url = "http://www.darklyrics.com/"
        self.searchUrl = "http://www.darklyrics.com/search?q=%s"
        self.cookie = self.getCookie()

    def getCookie(self):
        """Return the value for the ``lastvisitts`` cookie as a string.

        The value is a 31-multiplier string hash, wrapped to a signed 32-bit
        integer, of ``'Nergal'`` followed by the number of the current
        six-hour period since the epoch. The original comment points at
        http://www.darklyrics.com/tban.js as the source of the algorithm.

        The wraparound is done with plain integer masking, so the result no
        longer depends on ``ctypes`` being importable (previously a missing
        ``c_int32`` made this method silently return ``None``).
        """
        token = 'Nergal' + str(math.ceil(time.time() * 1000 / (60 * 60 * 6 * 1000)))
        value = 0
        for char in token:
            # (value << 5) - value == value * 31; keep only the low 32 bits.
            value = (value * 31 + ord(char)) & 0xFFFFFFFF
        # Reinterpret the unsigned 32-bit result as a signed integer.
        if value >= 0x80000000:
            value -= 0x100000000
        return str(value)

    def _fetch(self, url):
        """Return the body text of ``url``, or None if the request fails."""
        try:
            response = requests.get(url, headers=self._HEADERS,
                                    cookies={'lastvisitts': self.cookie},
                                    timeout=self._TIMEOUT)
            return response.text
        except requests.RequestException:
            return None

    def search(self, artist, title):
        """Search the site for ``artist`` and ``title``.

        Returns None when the request fails or nothing matches. Otherwise
        returns a list with one entry per hit, each entry being
        ``[caption, url, artist, title, anchor]`` where ``anchor`` is the
        numeric fragment of the hit's link. For the first few hits the
        caption is extended with the album name in parentheses.
        """
        term = urllib.parse.quote((artist or '') + '+' + (title or ''))
        page = self._fetch(self.searchUrl % term)
        if page is None:
            return None
        matches = re.findall(r'<h2><a\shref="(.*?#([0-9]+))".*?>(.*?)</a></h2>', page)
        if not matches:
            return None
        links = []
        for position, (href, anchor, caption) in enumerate(matches):
            url = self.base_url + href
            if position < self._ALBUM_LOOKUP_LIMIT:
                # Each album lookup is an extra HTTP request, hence the limit.
                caption = caption + ' ' + self.getAlbumName(url)
            links.append([caption, url, artist, title, anchor])
        return links

    def findLyrics(self, url, index):
        """Download ``url`` and return the lyrics stored under anchor ``index``.

        Returns None when the request fails or the anchor is not found.
        """
        page = self._fetch(url)
        if page is None:
            return None
        return self._extract_lyrics(page, index)

    @staticmethod
    def _extract_lyrics(page, index):
        """Return the text following ``<a name="index">`` in ``page``, or None.

        The text runs up to the next ``<h3>`` or ``<div``; the ``<br />``,
        ``<i>``, ``</i>``, ``</a>`` and ``</h3>`` markers are removed.
        """
        # DOTALL lets the lyrics span several lines; the index is escaped so
        # it is always matched literally.
        pattern = r'<a\sname="%s">(.*?)(?:<h3>|<div)' % re.escape(index)
        match = re.search(pattern, page, re.DOTALL)
        if not match:
            return None
        text = match.group(1)
        for tag in ('<br />', '<i>', '</i>', '</a>', '</h3>'):
            text = text.replace(tag, '')
        return text

    def getAlbumName(self, url):
        """Return the album/single/EP/live title found at ``url``.

        The title is wrapped in parentheses with apostrophes removed. An
        empty string is returned when the request fails or no title heading
        is found.
        """
        page = self._fetch(url)
        if page is None:
            return ''
        match = re.search(r'<h2>(?:album|single|ep|live):?\s?(.*?)</h2>', page, re.IGNORECASE)
        if not match:
            return ''
        return ('(' + match.group(1) + ')').replace('\'', '')

    def get_lyrics(self, lyrics):
        """Fill ``lyrics.lyrics`` with the lyrics of the first search hit.

        When the search yields more than one hit, the full hit list is also
        stored in ``lyrics.list``. Returns True on success and False when
        nothing was found or the lyrics could not be extracted.
        """
        utilities.log(debug, "%s: searching lyrics for %s - %s - %s" % (__title__, lyrics.artist, lyrics.album, lyrics.title))
        links = self.search(lyrics.artist, lyrics.title)
        if not links:
            return False
        if len(links) > 1:
            lyrics.list = links
        text = self.get_lyrics_from_list(links[0])
        if not text:
            return False
        lyrics.lyrics = text
        return True

    def get_lyrics_from_list(self, link):
        """Return the lyrics for one entry of the list built by search()."""
        _caption, url, _artist, _song, index = link
        return self.findLyrics(url, index)
130 
def performSelfTest():
    """Run a fixed search (Neurosis - Lost) and exit with the result.

    Logs the outcome, prints the lyrics XML via buildLyrics() and exits with
    status 0 when lyrics are found; otherwise logs a failure and exits with
    status 1. This function does not return.
    """
    lyrics = utilities.Lyrics()
    lyrics.source = __title__
    lyrics.syncronized = __syncronized__
    lyrics.artist = 'Neurosis'
    lyrics.album = ''
    lyrics.title = 'Lost'

    fetcher = LyricsFetcher()
    if fetcher.get_lyrics(lyrics):
        utilities.log(True, "Everything appears in order.")
        buildLyrics(lyrics)
        sys.exit(0)

    utilities.log(True, "The lyrics for the test search failed!")
    sys.exit(1)
150 
def buildLyrics(lyrics):
    """Log the found lyrics as a ``<lyrics>`` XML document, then exit.

    The document carries the artist, album, title, syncronized flag and
    grabber name, followed by one ``<lyric>`` element per line of
    ``lyrics.lyrics``. Characters outside the XML 1.0 ``Char`` range are
    removed from each line first. The process exits with status 0 once the
    document has been logged.
    """
    from lxml import etree
    xml = etree.XML(u'<lyrics></lyrics>')
    etree.SubElement(xml, "artist").text = lyrics.artist
    etree.SubElement(xml, "album").text = lyrics.album
    etree.SubElement(xml, "title").text = lyrics.title
    etree.SubElement(xml, "syncronized").text = 'True' if __syncronized__ else 'False'
    etree.SubElement(xml, "grabber").text = lyrics.source

    # Supplementary-plane code points need the 8-digit \U escape. With the
    # 4-digit \u form, "\u10000-\u10FFFF" is read as U+1000, the range
    # '0'-U+10FF and two literal 'F's, which caused valid characters above
    # U+FFFF to be stripped.
    invalid_xml_chars = re.compile(
        u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+')
    for line in lyrics.lyrics.splitlines():
        etree.SubElement(xml, "lyric").text = invalid_xml_chars.sub('', line)

    utilities.log(True, utilities.convert_etree(etree.tostring(xml, encoding='UTF-8',
                                                pretty_print=True, xml_declaration=True)))
    sys.exit(0)
168 
def buildVersion():
    """Log the grabber description as a ``<grabber>`` XML document, then exit.

    The document is built from the module metadata constants (name, author,
    description, version, priority, syncronized flag) plus the fixed command
    name and type. The process exits with status 0 afterwards.
    """
    from lxml import etree
    version = etree.XML(u'<grabber></grabber>')
    fields = (
        ("name", __title__),
        ("author", __author__),
        ("command", 'darklyrics.py'),
        ("type", 'lyrics'),
        ("description", __description__),
        ("version", __version__),
        ("priority", __priority__),
        ("syncronized", 'True' if __syncronized__ else 'False'),
    )
    for tag, text in fields:
        etree.SubElement(version, tag).text = text

    utilities.log(True, utilities.convert_etree(etree.tostring(version, encoding='UTF-8',
                                                pretty_print=True, xml_declaration=True)))
    sys.exit(0)
184 
def main():
    """Parse the command line and run the requested grabber action.

    ``-v`` prints the grabber description, ``-t`` runs the self test, and
    otherwise the artist/album/title/filename options are copied onto a
    Lyrics object and a search is performed. Exits with status 0 when lyrics
    were found and 1 when they were not.
    """
    global debug

    parser = OptionParser()

    parser.add_option('-v', "--version", action="store_true", default=False,
                      dest="version", help="Display version and author")
    parser.add_option('-t', "--test", action="store_true", default=False,
                      dest="test", help="Test grabber with a know good search")
    parser.add_option('-s', "--search", action="store_true", default=False,
                      dest="search", help="Search for lyrics.")
    parser.add_option('-a', "--artist", metavar="ARTIST", default=None,
                      dest="artist", help="Artist of track.")
    parser.add_option('-b', "--album", metavar="ALBUM", default=None,
                      dest="album", help="Album of track.")
    parser.add_option('-n', "--title", metavar="TITLE", default=None,
                      dest="title", help="Title of track.")
    parser.add_option('-f', "--filename", metavar="FILENAME", default=None,
                      dest="filename", help="Filename of track.")
    parser.add_option('-d', '--debug', action="store_true", default=False,
                      dest="debug", help=("Show debug messages"))
    opts, args = parser.parse_args()

    lyrics = utilities.Lyrics()
    lyrics.source = __title__
    lyrics.syncronized = __syncronized__

    if opts.debug:
        debug = True

    if opts.version:
        # buildVersion() exits the process itself.
        buildVersion()

    if opts.test:
        # NOTE(review): the statement under this branch was missing from the
        # listing this block was restored from. performSelfTest() is the only
        # function in the file that is not called anywhere else, so it is
        # restored here - confirm against the original source.
        performSelfTest()

    # Only options that were actually given override the Lyrics defaults.
    if opts.artist:
        lyrics.artist = opts.artist
    if opts.album:
        lyrics.album = opts.album
    if opts.title:
        lyrics.title = opts.title
    if opts.filename:
        lyrics.filename = opts.filename

    fetcher = LyricsFetcher()
    if fetcher.get_lyrics(lyrics):
        buildLyrics(lyrics)
        sys.exit(0)
    else:
        utilities.log(True, "No lyrics found for this track")
        sys.exit(1)
237 
# Run the command-line entry point only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    main()
darklyrics.LyricsFetcher.cookie
cookie
Definition: darklyrics.py:39
darklyrics.performSelfTest
def performSelfTest()
Definition: darklyrics.py:131
darklyrics.LyricsFetcher.search
def search(self, artist, title)
Definition: darklyrics.py:54
darklyrics.LyricsFetcher.searchUrl
searchUrl
Definition: darklyrics.py:38
darklyrics.LyricsFetcher.get_lyrics_from_list
def get_lyrics_from_list(self, link)
Definition: darklyrics.py:127
darklyrics.LyricsFetcher.base_url
base_url
Definition: darklyrics.py:37
darklyrics.LyricsFetcher.__init__
def __init__(self)
Definition: darklyrics.py:36
darklyrics.buildLyrics
def buildLyrics(lyrics)
Definition: darklyrics.py:151
darklyrics.main
def main()
Definition: darklyrics.py:185
darklyrics.buildVersion
def buildVersion()
Definition: darklyrics.py:169
darklyrics.LyricsFetcher
Definition: darklyrics.py:34
darklyrics.LyricsFetcher.getAlbumName
def getAlbumName(self, url)
Definition: darklyrics.py:100
darklyrics.LyricsFetcher.get_lyrics
def get_lyrics(self, lyrics)
Definition: darklyrics.py:113
darklyrics.LyricsFetcher.findLyrics
def findLyrics(self, url, index)
Definition: darklyrics.py:79
darklyrics.LyricsFetcher.getCookie
def getCookie(self)
Definition: darklyrics.py:40