MythTV  master
darklyrics.py
Go to the documentation of this file.
1 #-*- coding: UTF-8 -*-
2 """
3 Scraper for http://www.darklyrics.com/ - the largest metal lyrics archive on the Web.
4 
5 scraper by smory
6 """
7 
8 import hashlib
9 import math
10 from urllib.request import Request, urlopen
11 from urllib.parse import quote
12 import re
13 import time
14 import chardet
15 import sys
16 from optparse import OptionParser
17 from common import utilities
18 
19 __author__ = "Paul Harrison and smory'"  # written to the <author> element by buildVersion()
20 __title__ = "DarkLyrics"  # grabber name; also stored as lyrics.source and used as the log prefix
21 __description__ = "Search http://www.darklyrics.com/ - the largest metal lyrics archive on the Web"  # <description> element of the version XML
22 __priority__ = "180"  # kept as a string because it is assigned directly to an XML text node
23 __version__ = "0.2"  # <version> element of the version XML
24 __syncronized__ = False  # copied to lyrics.syncronized and emitted as 'True'/'False' in the XML output
25 
26 debug = False  # passed to utilities.log() by get_lyrics(); main() sets it to True when -d/--debug is given
27 
class LyricsFetcher:
    """Lyrics grabber backed by the darklyrics.com search page.

    The public entry point is :meth:`get_lyrics`, which fills in
    ``lyrics.lyrics`` (and ``lyrics.list`` when several candidates were
    found) and reports success as a boolean.  All network failures are
    treated as "nothing found" rather than raised.
    """

    def __init__( self ):
        self.base_url = "http://www.darklyrics.com/"
        self.searchUrl = "http://www.darklyrics.com/search?q=%term%"
        # Value sent back to the site as the "lastvisitts" cookie by search().
        self.cookie = self.getCookie()

    def getCookie(self, now=None):
        """Return the value for the ``lastvisitts`` cookie as a string.

        The value is a string hash of the current six-hour time bucket,
        ported from http://www.darklyrics.com/tban.js.

        :param now: Unix timestamp in seconds to derive the bucket from;
            defaults to ``time.time()``.
        :returns: decimal string of a signed 32-bit integer.
        """
        if now is None:
            now = time.time()
        # Number of six-hour periods since the epoch, rounded up.
        lastvisitts = str(int(math.ceil(now * 1000 / (60 * 60 * 6 * 1000))))
        lastvisittscookie = 0
        for char in lastvisitts:
            lastvisittscookie = ((lastvisittscookie << 5) - lastvisittscookie) + ord(char)
            # The original `x & x` is the JavaScript idiom for truncating to
            # a signed 32-bit integer; on Python ints it is a no-op, so the
            # wrap-around has to be done explicitly.
            lastvisittscookie = ((lastvisittscookie + 0x80000000) & 0xFFFFFFFF) - 0x80000000
        return str(lastvisittscookie)

    def search(self, artist, title):
        """Query the site search and return candidate songs.

        :param artist: artist name (may be empty or None).
        :param title: song title (may be empty or None).
        :returns: ``None`` when the request fails or nothing matches,
            otherwise a list of ``[display_title, url, artist, title, index]``
            lists where ``display_title`` and ``index`` are ``bytes``.
        """
        term = quote((artist if artist else "") + " " + (title if title else ""))

        try:
            request = Request(self.searchUrl.replace("%term%", term))
            request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0')
            request.add_header("Cookie", "lastvisitts=%s"% self.cookie)
            with urlopen(request, timeout=10) as content:
                searchResponse = content.read()
        except Exception:
            # Best effort: any network/HTTP problem means "no result".
            return None

        searchResult = re.findall(rb"<h2><a\shref=\"(.*?#([0-9]+))\".*?>(.*?)</a></h2>", searchResponse)

        if not searchResult:
            return None

        links = []
        for position, result in enumerate(searchResult):
            url = self.base_url + result[0].decode('utf-8')
            # Only the first six hits get an extra request for the album name.
            suffix = b" " + self.getAlbumName(url) if position < 6 else b""
            links.append([
                result[2] + suffix,  # title from server + album name
                url,                 # url with lyrics
                artist,
                title,
                result[1],           # id of the page part containing this song's lyrics
            ])

        return links

    def findLyrics(self, url, index):
        """Download an album page and cut out the lyrics of one song.

        :param url: address of the page containing the lyrics.
        :param index: ``bytes`` anchor name of the song on that page.
        :returns: lyrics as ``bytes`` with a few HTML tags removed, or
            ``None`` when the download fails or the anchor is not found.
        """
        try:
            with urlopen(url, timeout=10) as response:
                res = response.read()
        except Exception:
            return None

        # Needs multi-line and dot-all mode; the anchor is escaped so it is
        # always matched literally.
        pattern = rb"<a\sname=\"%index%\">(.*?)(?:<h3>|<div)"
        pattern = pattern.replace(b"%index%", re.escape(index))

        match = re.search(pattern, res, re.MULTILINE | re.DOTALL)
        if not match:
            return None

        s = match.group(1)
        for tag in (b"<br />", b"<i>", b"</i>", b"</a>", b"</h3>"):
            s = s.replace(tag, b"")
        return s

    def getAlbumName(self, url):
        """Return the release name found on *url* as ``b"(name)"``.

        :param url: address of an album page.
        :returns: the parenthesised name with double quotes removed, or
            ``b""`` when the download fails or no heading matches.
        """
        try:
            with urlopen(url, timeout=10) as response:
                res = response.read()
        except Exception:
            return b""

        match = re.search(rb"<h2>(?:album|single|ep|live):?\s?(.*?)</h2>", res, re.IGNORECASE)
        if not match:
            return b""
        return (b"(" + match.group(1) + b")").replace(b"\"", b"")


    def get_lyrics(self, lyrics):
        """Search for ``lyrics.artist`` / ``lyrics.title`` and store the text.

        On success ``lyrics.lyrics`` holds the decoded text; when more than
        one candidate was found they are stored in ``lyrics.list``.

        :param lyrics: object with ``artist``, ``album`` and ``title``
            attributes that receives the result.
        :returns: ``True`` when lyrics were found, ``False`` otherwise.
        """
        utilities.log(debug, "%s: searching lyrics for %s - %s - %s" % (__title__, lyrics.artist, lyrics.album, lyrics.title))
        links = self.search(lyrics.artist, lyrics.title)

        if not links:
            return False
        if len(links) > 1:
            lyrics.list = links

        lyr = self.get_lyrics_from_list(links[0])
        if not lyr:
            return False

        enc = chardet.detect(lyr)
        # chardet reports encoding None when it cannot decide; fall back to
        # UTF-8 instead of failing inside bytes.decode().
        lyrics.lyrics = lyr.decode(enc['encoding'] or 'utf-8', 'ignore')
        return True

    def get_lyrics_from_list(self, link):
        """Fetch the lyrics for one entry of the list built by search().

        :param link: five-item sequence ``(title, url, artist, song, index)``.
        :returns: whatever :meth:`findLyrics` returns for that entry.
        """
        _title, url, _artist, _song, index = link
        return self.findLyrics(url, index)
137 
def performSelfTest():
    """Run a lookup for a known song and exit with the outcome.

    Exits the process with status 0 after printing the lyrics XML when the
    lookup succeeds, or with status 1 after logging a failure message.
    """
    lyrics = utilities.Lyrics()
    lyrics.source = __title__
    lyrics.syncronized = __syncronized__
    lyrics.artist = 'Dagon'
    lyrics.album = 'Terraphobic'
    lyrics.title = 'Cut To The Heart'

    fetcher = LyricsFetcher()
    found = fetcher.get_lyrics(lyrics)

    if found:
        utilities.log(True, "Everything appears in order.")
        buildLyrics(lyrics)
        # buildLyrics() exits on its own; kept as a safety net.
        sys.exit(0)

    utilities.log(True, "The lyrics for the test search failed!")
    sys.exit(1)
157 
def buildLyrics(lyrics):
    """Log the lyrics as an XML document and exit with status 0.

    :param lyrics: object providing ``artist``, ``album``, ``title``,
        ``source`` and the decoded ``lyrics`` text.
    """
    from lxml import etree
    xml = etree.XML(u'<lyrics></lyrics>')
    etree.SubElement(xml, "artist").text = lyrics.artist
    etree.SubElement(xml, "album").text = lyrics.album
    etree.SubElement(xml, "title").text = lyrics.title
    etree.SubElement(xml, "syncronized").text = 'True' if __syncronized__ else 'False'
    etree.SubElement(xml, "grabber").text = lyrics.source

    # Matches runs of characters that are not allowed in XML 1.0 text.
    # The supplementary planes need the 8-digit \U escape: "\u10000" is read
    # as U+1000 followed by "0", which made the old pattern strip valid
    # characters above U+FFFF.
    invalid_xml_chars = re.compile(
        u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+')

    for line in lyrics.lyrics.splitlines():
        etree.SubElement(xml, "lyric").text = invalid_xml_chars.sub('', line)

    utilities.log(True, utilities.convert_etree(etree.tostring(xml, encoding='UTF-8',
                                                pretty_print=True, xml_declaration=True)))
    sys.exit(0)
175 
def buildVersion():
    """Log the grabber's self-description as XML and exit with status 0."""
    from lxml import etree
    version = etree.XML(u'<grabber></grabber>')

    # Element order is kept exactly as in the original output.
    fields = (
        ("name", __title__),
        ("author", __author__),
        ("command", 'darklyrics.py'),
        ("type", 'lyrics'),
        ("description", __description__),
        ("version", __version__),
        ("priority", __priority__),
        ("syncronized", 'True' if __syncronized__ else 'False'),
    )
    for tag, text in fields:
        etree.SubElement(version, tag).text = text

    utilities.log(True, utilities.convert_etree(etree.tostring(version, encoding='UTF-8',
                                                pretty_print=True, xml_declaration=True)))
    sys.exit(0)
191 
def main():
    """Command-line entry point of the grabber.

    Parses the options, then either prints the version XML (``-v``), runs
    the self test (``-t``), or searches for the track described by the
    artist/album/title/filename options.  Exits with status 0 when lyrics
    were printed and 1 when none were found.
    """
    global debug

    parser = OptionParser()

    parser.add_option('-v', "--version", action="store_true", default=False,
                      dest="version", help="Display version and author")
    parser.add_option('-t', "--test", action="store_true", default=False,
                      dest="test", help="Test grabber with a know good search")
    parser.add_option('-s', "--search", action="store_true", default=False,
                      dest="search", help="Search for lyrics.")
    parser.add_option('-a', "--artist", metavar="ARTIST", default=None,
                      dest="artist", help="Artist of track.")
    parser.add_option('-b', "--album", metavar="ALBUM", default=None,
                      dest="album", help="Album of track.")
    parser.add_option('-n', "--title", metavar="TITLE", default=None,
                      dest="title", help="Title of track.")
    parser.add_option('-f', "--filename", metavar="FILENAME", default=None,
                      dest="filename", help="Filename of track.")
    parser.add_option('-d', '--debug', action="store_true", default=False,
                      dest="debug", help=("Show debug messages"))
    opts, args = parser.parse_args()

    lyrics = utilities.Lyrics()
    lyrics.source = __title__
    lyrics.syncronized = __syncronized__

    if opts.debug:
        debug = True

    # Both helpers below terminate the process themselves.
    if opts.version:
        buildVersion()

    if opts.test:
        performSelfTest()

    if opts.artist:
        lyrics.artist = opts.artist
    if opts.album:
        lyrics.album = opts.album
    if opts.title:
        lyrics.title = opts.title
    if opts.filename:
        lyrics.filename = opts.filename

    fetcher = LyricsFetcher()
    if fetcher.get_lyrics(lyrics):
        buildLyrics(lyrics)
        sys.exit(0)
    else:
        utilities.log(True, "No lyrics found for this track")
        sys.exit(1)


if __name__ == '__main__':
    main()
darklyrics.LyricsFetcher.cookie
cookie
Definition: darklyrics.py:33
darklyrics.performSelfTest
def performSelfTest()
Definition: darklyrics.py:138
darklyrics.LyricsFetcher.search
def search(self, artist, title)
Definition: darklyrics.py:44
decode
static int decode(unsigned char *vbiline, int scale0, int scale1)
Definition: cc.cpp:67
darklyrics.LyricsFetcher.searchUrl
searchUrl
Definition: darklyrics.py:32
darklyrics.LyricsFetcher.get_lyrics_from_list
def get_lyrics_from_list(self, link)
Definition: darklyrics.py:134
darklyrics.LyricsFetcher.base_url
base_url
Definition: darklyrics.py:31
darklyrics.LyricsFetcher.__init__
def __init__(self)
Definition: darklyrics.py:30
darklyrics.buildLyrics
def buildLyrics(lyrics)
Definition: darklyrics.py:158
darklyrics.main
def main()
Definition: darklyrics.py:192
hardwareprofile.distros.mythtv_data.request.Request
def Request(url=None)
Definition: distros/mythtv_data/request.py:64
darklyrics.buildVersion
def buildVersion()
Definition: darklyrics.py:176
darklyrics.LyricsFetcher
Definition: darklyrics.py:28
darklyrics.LyricsFetcher.getAlbumName
def getAlbumName(self, url)
Definition: darklyrics.py:100
darklyrics.LyricsFetcher.get_lyrics
def get_lyrics(self, lyrics)
Definition: darklyrics.py:116
darklyrics.LyricsFetcher.findLyrics
def findLyrics(self, url, index)
Definition: darklyrics.py:78
darklyrics.LyricsFetcher.getCookie
def getCookie(self)
Definition: darklyrics.py:35