MythTV master
lyricsScraper.py
Go to the documentation of this file.
1#-*- coding: UTF-8 -*-
2"""
3Scraper for http://lrcct2.ttplayer.com/
4
5taxigps
6"""
7
8import os
9import socket
10import urllib.request
11import re
12import random
13import difflib
14from lib.utils import *
15
# Scraper metadata read by the lyrics framework.
__title__ = "TTPlayer"
__priority__ = '110'
__lrc__ = True

# Browser-like User-Agent sent with the lyric download request.
UserAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'

# Default timeout (seconds) for every socket created after this point that
# does not set its own timeout.
socket.setdefaulttimeout(10)

# Regular expressions for parenthesised title suffixes that are removed
# before searching.  Raw strings keep the backslashes literal without relying
# on Python passing unknown escape sequences through unchanged.
LYRIC_TITLE_STRIP = [
    r"\(live[^\)]*\)",
    r"\(acoustic[^\)]*\)",
    r"\([^\)]*mix\)",
    r"\([^\)]*version\)",
    r"\([^\)]*edit\)",
    r"\(feat[^\)]*\)",
]
# (pattern, replacement) pairs applied to the title and the artist.
LYRIC_TITLE_REPLACE = [("/", "-"), (" & ", " and ")]
LYRIC_ARTIST_REPLACE = [("/", "-"), (" & ", " and ")]
29
class ttpClient(object):
    '''
    Provide TTPlayer-specific helpers: hex-encoding the artist and title for
    the search request, and generating the code the server expects together
    with a lyric Id when a download is requested.
    (see http://ttplyrics.googlecode.com/svn/trunk/crack)
    '''

    @staticmethod
    def CodeFunc(Id, data):
        '''
        Generate the code that accompanies a lyric Id.

        The arithmetic is a translation of C code that was itself translated
        from assembly grabbed with ollydbg from ttp_lrcs.dll
        (see http://ttplyrics.googlecode.com/svn/trunk/crack), so it works on
        signed 8-bit characters and wraps at 32 bits.

        Id   -- integer lyric id.
        data -- artist + title, either as str (hashed as its UTF-8 bytes) or
                as an already encoded bytes-like object.

        Returns the code as a signed 32-bit integer.
        '''
        mask = 0xFFFFFFFF

        # The algorithm consumes bytes.  Hashing code points instead gives a
        # wrong code for every non-ASCII artist or title.
        raw = data.encode('utf-8') if isinstance(data, str) else bytes(data)
        # Reinterpret each byte as a signed char, as the C code does.
        chars = [b - 0x100 if b >= 0x80 else b for b in raw]
        length = len(chars)

        # Build a 32-bit key by rearranging the four bytes of Id; a zero
        # byte is replaced by the complement of a neighbouring byte.
        tmp1 = (Id & 0x0000FF00) >> 8
        if (Id & 0x00FF0000) == 0:
            tmp3 = 0x000000FF & ~tmp1
        else:
            tmp3 = 0x000000FF & ((Id & 0x00FF0000) >> 16)
        tmp3 |= (0x000000FF & Id) << 8
        tmp3 <<= 8
        tmp3 |= 0x000000FF & tmp1
        tmp3 <<= 8
        if (Id & 0xFF000000) == 0:
            tmp3 |= 0x000000FF & ~Id
        else:
            tmp3 |= 0x000000FF & (Id >> 24)

        # Rolling hash over the data from the last byte to the first.
        tmp2 = 0
        for i in range(length - 1, -1, -1):
            tmp2 = (chars[i] + tmp2 + ((tmp2 << (i % 2 + 4)) & mask)) & mask

        # Second rolling hash, first byte to last, with a smaller shift.
        tmp1 = 0
        for i, char in enumerate(chars):
            tmp1 = (char + tmp1 + ((tmp1 << (i % 2 + 3)) & mask)) & mask

        # Mix both hashes with the Id and the key, wrapping at 32 bits.
        total = (((tmp2 ^ tmp3) & mask) + (tmp1 | Id)) & mask
        code = (total * (tmp1 | tmp3)) & mask
        code = (code * (tmp2 ^ Id)) & mask

        # Convert to signed 32-bit; 0x80000000 itself is already negative.
        if code >= 0x80000000:
            code -= 0x100000000
        return code

    @staticmethod
    def EncodeArtTit(data):
        '''
        Return data as lower-case hex of its UTF-16 little-endian bytes,
        e.g. 'ab' -> '61006200'.
        '''
        return data.encode('utf-16-le').hex()
111
112
def __init__(self):
    """Store the two TTPlayer server URL templates on the instance.

    LYRIC_URL is filled with two integers (``Id``, ``Code``) and a value
    rendered as 12 hex digits (``mac``); LIST_URL is filled with two
    strings (``Artist``, ``Title``).
    """
    # Download endpoint for a single lyric.
    self.LYRIC_URL = 'http://ttlrccnc.qianqian.com/dll/lyricsvr.dll?dl?Id=%d&Code=%d&uid=01&mac=%012x'
    # Search endpoint returning the list of candidate lyrics.
    self.LIST_URL = 'http://ttlrccnc.qianqian.com/dll/lyricsvr.dll?sh?Artist=%s&Title=%s&Flags=0'
117
def get_lyrics(self, song):
    """Search TTPlayer for *song* and return the first usable LRC lyrics.

    song -- object exposing ``artist`` and ``title`` strings.

    Returns a ``Lyrics`` object with ``lyrics`` set to the downloaded text,
    or None when the search request fails, nothing matches closely enough,
    or no candidate yields text starting with '['.
    """
    # Imported here because this file does not import sys itself; the
    # error log below must not depend on the star import providing it.
    import sys

    log("%s: searching lyrics for %s - %s" % (__title__, song.artist, song.title))
    lyrics = Lyrics()
    lyrics.song = song
    lyrics.source = __title__
    lyrics.lrc = __lrc__
    artist = song.artist
    title = song.title

    # replace ampersands and the like
    for pattern, replacement in LYRIC_ARTIST_REPLACE:
        artist = re.sub(pattern, replacement, artist)
    for pattern, replacement in LYRIC_TITLE_REPLACE:
        title = re.sub(pattern, replacement, title)

    # strip things like "(live at Somewhere)", "(acoustic)", etc.
    # The patterns are lower-case while real titles are usually capitalised,
    # so match case-insensitively.
    for pattern in LYRIC_TITLE_STRIP:
        title = re.sub(pattern, '', title, flags=re.IGNORECASE)

    # trim surrounding whitespace and drop backticks and slashes
    title = title.strip().replace('`', '').replace('/', '')
    artist = artist.strip().replace('`', '').replace('/', '')

    try:
        url = self.LIST_URL % (
            ttpClient.EncodeArtTit(artist.replace(' ', '').lower()),
            ttpClient.EncodeArtTit(title.replace(' ', '').lower()),
        )
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(url) as f:
            Page = f.read().decode('utf-8')
    except Exception:
        # Best effort: log and report "no lyrics", but let
        # KeyboardInterrupt/SystemExit propagate.
        log("%s: %s::%s (%d) [%s]" % (
            __title__, self.__class__.__name__,
            sys.exc_info()[2].tb_frame.f_code.co_name,
            sys.exc_info()[2].tb_lineno,
            sys.exc_info()[1]
        ))
        return None

    links_query = re.compile(r'<lrc id="(.*?)" artist="(.*?)" title="(.*?)"></lrc>')
    links = []
    for lrc_id, found_artist, found_title in links_query.findall(Page):
        # Keep only hits whose artist and title both resemble the request.
        artist_ratio = difflib.SequenceMatcher(None, artist.lower(), found_artist.lower()).ratio()
        title_ratio = difflib.SequenceMatcher(None, title.lower(), found_title.lower()).ratio()
        if artist_ratio > 0.8 and title_ratio > 0.8:
            links.append((found_artist + ' - ' + found_title, lrc_id, found_artist, found_title))

    if not links:
        return None
    if len(links) > 1:
        lyrics.list = links

    # Try the candidates in order and return the first that looks like LRC.
    for link in links:
        lyr = self.get_lyrics_from_list(link)
        if lyr and lyr.startswith('['):
            lyrics.lyrics = lyr
            return lyrics
    return None
171
def get_lyrics_from_list(self, link):
    """Download the lyrics for one search hit and sanity-check them.

    link -- 4-tuple ``(label, Id, artist, title)``; the label is unused.

    Returns the downloaded text when it starts with '[' and its ``[ti:]`` /
    ``[ar:]`` tags (if present and non-empty) resemble the expected title /
    artist, '' when the text is rejected, and None when the request fails.
    """
    # Imported here because this file does not import sys itself; the
    # error log below must not depend on the star import providing it.
    import sys

    _label, Id, artist, song = link
    try:
        url = self.LYRIC_URL % (
            int(Id),
            ttpClient.CodeFunc(int(Id), artist + song),
            random.randint(0, 0xFFFFFFFFFFFF),
        )
        log('%s: search url: %s' % (__title__, url))
        header = {'User-Agent': UserAgent}
        req = urllib.request.Request(url, headers=header)
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req) as f:
            Page = f.read().decode('utf-8')
    except Exception:
        # Best effort: log and report failure, but let
        # KeyboardInterrupt/SystemExit propagate.
        log("%s: %s::%s (%d) [%s]" % (
            __title__, self.__class__.__name__,
            sys.exc_info()[2].tb_frame.f_code.co_name,
            sys.exc_info()[2].tb_lineno,
            sys.exc_info()[1]
        ))
        return None

    if not Page.startswith('['):
        return ''
    if not Page.startswith('[ti:'):
        return Page

    # ttplayer occasionally returns incorrect lyrics. If we have a 'ti'
    # and/or an 'ar' tag with a value we can check whether it matches the
    # title / artist.  splitlines() plus rstrip() keeps CRLF line endings
    # from leaking a stray character into the tag value.
    check = [line.rstrip() for line in Page.splitlines()]

    tagged_title = check[0][4:-1]
    if tagged_title != '':
        if difflib.SequenceMatcher(None, song.lower(), tagged_title.lower()).ratio() > 0.8:
            return Page
        return ''

    # The title tag is empty: fall back to the artist tag on the second
    # line, if the page has a second line at all.
    if len(check) > 1 and check[1][0:4] == '[ar:':
        tagged_artist = check[1][4:-1]
        if tagged_artist != '':
            if difflib.SequenceMatcher(None, artist.lower(), tagged_artist.lower()).ratio() > 0.8:
                return Page
            return ''
    return Page
None log(str msg, int level=LOGDEBUG)
Definition: xbmc.py:9