MythTV master
lyricsScraper.py
Go to the documentation of this file.
#-*- coding: UTF-8 -*-
'''
Scraper for http://www.viewlyrics.com

PedroHLC
https://github.com/PedroHLC/ViewLyricsOpenSearcher

rikels
https://github.com/rikels/LyricsSearch
'''
11
12import re
13import hashlib
14import difflib
15import chardet
16import requests
17from lib.utils import *
18
# Scraper name: used as the log prefix and stored as the source of the
# Lyrics object returned by MiniLyrics.get_lyrics.
__title__ = 'MiniLyrics'
# NOTE(review): presumably orders this scraper relative to the other
# scrapers; the consumer is not visible in this file - confirm.
__priority__ = '100'
# Copied to the ``lrc`` attribute of the Lyrics object in get_lyrics.
# NOTE(review): presumably marks the lyrics as LRC (time-synced) - confirm.
__lrc__ = True
22
23
class MiniLyrics(object):
    '''
    Minilyrics specific functions

    The static helpers implement the request/response obfuscation used by
    the search endpoint; the instance methods run a search and download
    the first matching .lrc file.
    '''

    @staticmethod
    def hexToStr(hexx):
        '''
        Convert a string of hex digits into a string of characters.

        Each pair of hex digits becomes one character with that code
        point; a trailing unpaired digit is ignored.
        '''
        return ''.join(chr(int(hexx[pos:pos + 2], 16))
                       for pos in range(0, len(hexx) - 1, 2))

    @staticmethod
    def vl_enc(data, md5_extra):
        '''
        Encode a search query for the server.

        data      -- the query as a byte string
        md5_extra -- byte string appended to ``data`` before hashing

        Returns a text string made of: '\\x02', the XOR key as one
        character, '\\x04\\x00\\x00\\x00', the 16 MD5 digest bytes of
        ``data + md5_extra`` as characters, and finally ``data`` XOR-ed
        with the key. The key is the rounded average of all byte values
        of ``data``.

        Raises ZeroDivisionError when ``data`` is empty.
        '''
        md5 = hashlib.md5()
        md5.update(data + md5_extra)
        hasheddata = MiniLyrics.hexToStr(md5.hexdigest())

        # Iterating a byte string yields ints on Python 3 and 1-character
        # strings on Python 2; normalise to ints once.
        values = [item if isinstance(item, int) else ord(item) for item in data]
        magickey = int(round(float(sum(values)) / float(len(values))))
        encddata = bytearray([value ^ magickey for value in values])

        header = '\x02' + chr(magickey) + '\x04\x00\x00\x00' + str(hasheddata)
        try:
            body = encddata.decode('utf-8')
        except UnicodeDecodeError:
            body = None
            ecd = chardet.detect(encddata)
            if ecd['encoding']:
                try:
                    body = encddata.decode(ecd['encoding'])
                except (UnicodeError, LookupError):
                    # Wrong guess or a codec Python does not know.
                    body = None
            if body is None:
                # Last resort: map every byte to the character with the
                # same code point.
                body = "".join(map(chr, encddata))
        return header + body

    @staticmethod
    def vl_dec(data):
        '''
        Decode a server response.

        The second element of ``data`` is the XOR key and the payload
        starts at offset 22; every payload element is XOR-ed with the key
        and returned as one text string. ``data`` may be a text or a byte
        string.

        Returns '' when ``data`` is too short to contain a key, so that an
        empty response is treated as "no result" instead of raising
        IndexError.
        '''
        if len(data) < 2:
            return ''
        magickey = data[1]
        if not isinstance(magickey, int):
            magickey = ord(magickey)
        return ''.join(
            chr((item if isinstance(item, int) else ord(item)) ^ magickey)
            for item in data[22:]
        )

    @staticmethod
    def _is_similar(wanted, found):
        '''Return True when the case-insensitive similarity ratio exceeds 0.8.'''
        return difflib.SequenceMatcher(None, wanted.lower(), found.lower()).ratio() > 0.8

    def __init__(self, *args, **kwargs):
        '''
        Store the scraper configuration.

        Requires the keyword arguments ``debug`` and ``settings``; a
        missing one raises KeyError.
        '''
        self.DEBUG = kwargs['debug']
        self.settings = kwargs['settings']
        self.proxy = None

    def htmlDecode(self, string):
        '''Replace the five predefined XML entities in ``string`` by their characters.'''
        # '&amp;' must stay last so that e.g. '&amp;lt;' decodes to '&lt;'
        # and not to '<'.
        entities = (
            ('&apos;', '\''),
            ('&quot;', '"'),
            ('&gt;', '>'),
            ('&lt;', '<'),
            ('&amp;', '&'),
        )
        for entity, char in entities:
            string = string.replace(entity, char)
        return string

    def get_lyrics(self, song):
        '''
        Search lyrics for ``song`` (an object with ``artist`` and ``title``).

        Returns a Lyrics object holding the text of the first matching
        .lrc file, or None when the request fails, nothing sufficiently
        similar is found, or the download fails. When more than one file
        matches, all candidates are stored in ``lyrics.list``.
        '''
        log('%s: searching lyrics for %s - %s' % (__title__, song.artist, song.title), debug=self.DEBUG)
        lyrics = Lyrics(settings=self.settings)
        lyrics.song = song
        lyrics.source = __title__
        lyrics.lrc = __lrc__
        search_url = 'http://search.crintsoft.com/searchlyrics.htm'
        # NOTE(review): artist and title are inserted without XML escaping;
        # confirm how the server expects quotes and ampersands.
        search_query_base = (
            "<?xml version='1.0' encoding='utf-8' standalone='yes' ?>"
            "<searchV1 client=\"ViewLyricsOpenSearcher\" artist=\"{artist}\" title=\"{title}\" OnlyMatched=\"1\" />"
        )
        search_useragent = 'MiniLyrics'
        search_md5watermark = b'Mlv1clt4.0'
        search_query = search_query_base.format(artist=song.artist, title=song.title)
        search_encquery = MiniLyrics.vl_enc(search_query.encode('utf-8'), search_md5watermark)
        headers = {
            "User-Agent": search_useragent,
            "Content-Length": str(len(search_encquery)),
            "Connection": "Keep-Alive",
            "Expect": "100-continue",
            "Content-Type": "application/x-www-form-urlencoded",
        }
        try:
            request = requests.post(search_url, data=search_encquery, headers=headers, timeout=10)
            search_result = request.text
        except Exception as err:
            # Best effort: any failure while talking to the server means
            # "no lyrics", but Ctrl-C and interpreter exit still propagate.
            log('%s: search request failed: %s' % (__title__, err), debug=self.DEBUG)
            return None
        rawdata = MiniLyrics.vl_dec(search_result)
        # might be a better way to parse the data
        lrcdata = rawdata.replace('\x00', '*')
        artistmatch = re.search(r'artist\*(.*?)\*', lrcdata)
        if not artistmatch:
            return None
        titlematch = re.search(r'title\*(.*?)\*', lrcdata)
        if not titlematch:
            return None
        artist = artistmatch.group(1)
        title = titlematch.group(1)
        links = []
        if MiniLyrics._is_similar(song.artist, artist) and MiniLyrics._is_similar(song.title, title):
            for item in re.findall(r'[a-z0-9/_]*?\.lrc', lrcdata):
                links.append((artist + ' - ' + title, item, artist, title))
        if not links:
            return None
        if len(links) > 1:
            lyrics.list = links
        lyr = self.get_lyrics_from_list(links[0])
        if not lyr:
            return None
        lyrics.lyrics = lyr
        return lyrics

    def get_lyrics_from_list(self, link):
        '''
        Download and decode the lyrics file of one search result.

        ``link`` is a 4-tuple as built by get_lyrics; only its second
        element, the path of the .lrc file, is used.

        Returns the decoded text, or None when the download fails.
        '''
        _, url, _, _ = link
        try:
            response = requests.get('http://search.crintsoft.com/l/' + url, timeout=10)
            raw = response.content
        except Exception as err:
            log('%s: download failed: %s' % (__title__, err), debug=self.DEBUG)
            return None
        enc = chardet.detect(raw)
        # chardet reports None when it cannot tell (e.g. an empty body);
        # fall back to UTF-8 instead of passing None to decode().
        return raw.decode(enc['encoding'] or 'utf-8', 'ignore')
def __init__(self, *args, **kwargs)
None log(str msg, int level=LOGDEBUG)
Definition: xbmc.py:9