MythTV  master
lyricsScraper.py
Go to the documentation of this file.
1 #-*- coding: UTF-8 -*-
2 """
3 Scraper for http://lrcct2.ttplayer.com/
4 
5 taxigps
6 """
7 
8 import os
9 import socket
10 import urllib.request
11 import re
12 import random
13 import difflib
14 from lib.utils import *
15 
# Scraper identity. __title__ is used as the log prefix and as the
# `source` of returned lyrics; __lrc__ is copied to the `lrc` attribute of
# returned lyrics.
# NOTE(review): __priority__ is not read in this file; presumably the
# scraper loader uses it for ordering -- confirm.
__title__ = "TTPlayer"
__priority__ = '110'
__lrc__ = True

# User-Agent header sent with the lyrics download request.
UserAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'

# Module-level side effect: default timeout (in seconds) for every socket
# created after this point, so a stalled server cannot block forever.
socket.setdefaulttimeout(10)

# Regular expressions for parenthesised title suffixes that are removed
# before searching.  Raw strings keep the backslashes literal; "\(" in a
# normal string literal is an invalid escape sequence.
LYRIC_TITLE_STRIP = [
    r"\(live[^\)]*\)",
    r"\(acoustic[^\)]*\)",
    r"\([^\)]*mix\)",
    r"\([^\)]*version\)",
    r"\([^\)]*edit\)",
    r"\(feat[^\)]*\)",
]

# (pattern, replacement) pairs applied to the title / artist before searching.
LYRIC_TITLE_REPLACE = [("/", "-"), (" & ", " and ")]
LYRIC_ARTIST_REPLACE = [("/", "-"), (" & ", " and ")]
29 
class ttpClient(object):
    '''
    Provide TTPlayer-specific helpers: hex-encoding the artist and title
    for the search request, and generating the Id code the server requires
    to authorize a lyrics download.
    (see http://ttplyrics.googlecode.com/svn/trunk/crack)
    '''

    @staticmethod
    def CodeFunc(Id, data):
        '''
        Generate the authorization code for a lyrics Id.

        The arithmetic is a transliteration of C code, which was itself
        translated from assembly grabbed with OllyDbg from ttp_lrcs.dll
        (see http://ttplyrics.googlecode.com/svn/trunk/crack).  Intermediate
        values are masked to 32 bits to mimic register arithmetic.

        Id   -- integer lyrics id
        data -- artist + title.  A str is hashed over its UTF-8 bytes;
                bytes / bytearray are hashed as given.
        Returns the code as a signed 32-bit integer.
        '''
        mask = 0xFFFFFFFF

        # The routine works on signed chars, so it must see bytes rather
        # than Unicode code points (ord() of a non-Latin-1 character does
        # not fit in a char).
        # NOTE(review): UTF-8 is assumed because this file decodes the
        # server responses as UTF-8 -- confirm against the reference code.
        if isinstance(data, str):
            data = data.encode('utf-8')
        # Reinterpret each byte as a signed char (0x80..0xFF -> -128..-1).
        signed = [b - 0x100 if b >= 0x80 else b for b in data]

        # Shuffle the bytes of Id into tmp3.  From most to least significant
        # byte the result is:
        #   Id byte 0 | Id byte 2 (or ~byte 1 if byte 2 is zero)
        #   Id byte 1 | Id byte 3 (or ~byte 0 if byte 3 is zero)
        # e.g. Id = 0x00015F18 -> tmp3 = 0x18015FE7
        tmp1 = (Id & 0x0000FF00) >> 8
        if (Id & 0x00FF0000) == 0:
            tmp3 = 0x000000FF & ~tmp1
        else:
            tmp3 = 0x000000FF & ((Id & 0x00FF0000) >> 16)

        tmp3 = tmp3 | ((0x000000FF & Id) << 8)
        tmp3 = tmp3 << 8
        tmp3 = tmp3 | (0x000000FF & tmp1)
        tmp3 = tmp3 << 8
        if (Id & 0xFF000000) == 0:
            tmp3 = tmp3 | (0x000000FF & (~Id))
        else:
            tmp3 = tmp3 | (0x000000FF & (Id >> 24))

        # First hash: walk the data backwards; the shift is 4 or 5 bits
        # depending on the parity of the index.
        tmp2 = 0
        for i in range(len(signed) - 1, -1, -1):
            tmp1 = (signed[i] + tmp2) & mask
            tmp2 = (tmp2 << (i % 2 + 4)) & mask
            tmp2 = (tmp1 + tmp2) & mask

        # Second hash: walk the data forwards; the shift is 3 or 4 bits
        # depending on the parity of the index.
        tmp1 = 0
        for i, char in enumerate(signed):
            tmp7 = (char + tmp1) & mask
            tmp1 = (tmp1 << (i % 2 + 3)) & mask
            tmp1 = (tmp1 + tmp7) & mask

        # Mix both hashes with Id and the shuffled Id.
        tmp1 = (((((tmp2 ^ tmp3) & mask) + (tmp1 | Id)) & mask) * (tmp1 | tmp3)) & mask
        tmp1 = (tmp1 * (tmp2 ^ Id)) & mask

        # Reinterpret as a signed 32-bit integer; 0x80000000 itself is
        # already negative (INT_MIN), hence >= rather than >.
        if tmp1 >= 0x80000000:
            tmp1 = tmp1 - 0x100000000
        return tmp1

    @staticmethod
    def EncodeArtTit(data):
        '''
        Encode an artist or title string for the search URL.

        Returns the lowercase hex dump of the UTF-16 little-endian encoding
        of data (no byte order mark), e.g. 'ab' -> '61006200'.  Characters
        above U+00FF are emitted low byte first, like every other code unit.
        '''
        return data.encode('utf-16-le').hex()
111 
112 
114  def __init__(self):
115  self.LIST_URL = 'http://ttlrccnc.qianqian.com/dll/lyricsvr.dll?sh?Artist=%s&Title=%s&Flags=0'
116  self.LYRIC_URL = 'http://ttlrccnc.qianqian.com/dll/lyricsvr.dll?dl?Id=%d&Code=%d&uid=01&mac=%012x'
117 
118  def get_lyrics(self, song):
119  log("%s: searching lyrics for %s - %s" % (__title__, song.artist, song.title))
120  lyrics = Lyrics()
121  lyrics.song = song
122  lyrics.source = __title__
123  lyrics.lrc = __lrc__
124  artist = song.artist
125  title = song.title
126  # replace ampersands and the like
127  for exp in LYRIC_ARTIST_REPLACE:
128  p = re.compile(exp[0])
129  artist = p.sub(exp[1], artist)
130  for exp in LYRIC_TITLE_REPLACE:
131  p = re.compile(exp[0])
132  title = p.sub(exp[1], title)
133 
134  # strip things like "(live at Somewhere)", "(accoustic)", etc
135  for exp in LYRIC_TITLE_STRIP:
136  p = re.compile(exp)
137  title = p.sub('', title)
138 
139  # compress spaces
140  title = title.strip().replace('`','').replace('/','')
141  artist = artist.strip().replace('`','').replace('/','')
142 
143  try:
144  url = self.LIST_URL %(ttpClient.EncodeArtTit(artist.replace(' ','').lower()), ttpClient.EncodeArtTit(title.replace(' ','').lower()))
145  f = urllib.request.urlopen(url)
146  Page = f.read().decode('utf-8')
147  except:
148  log("%s: %s::%s (%d) [%s]" % (
149  __title__, self.__class__.__name__,
150  sys.exc_info()[2].tb_frame.f_code.co_name,
151  sys.exc_info()[2].tb_lineno,
152  sys.exc_info()[1]
153  ))
154  return None
155  links_query = re.compile('<lrc id=\"(.*?)\" artist=\"(.*?)\" title=\"(.*?)\"></lrc>')
156  urls = re.findall(links_query, Page)
157  links = []
158  for x in urls:
159  if (difflib.SequenceMatcher(None, artist.lower(), x[1].lower()).ratio() > 0.8) and (difflib.SequenceMatcher(None, title.lower(), x[2].lower()).ratio() > 0.8):
160  links.append((x[1] + ' - ' + x[2], x[0], x[1], x[2]))
161  if len(links) == 0:
162  return None
163  elif len(links) > 1:
164  lyrics.list = links
165  for link in links:
166  lyr = self.get_lyrics_from_list(link)
167  if lyr and lyr.startswith('['):
168  lyrics.lyrics = lyr
169  return lyrics
170  return None
171 
172  def get_lyrics_from_list(self, link):
173  title,Id,artist,song = link
174  try:
175 
176  url = self.LYRIC_URL %(int(Id),ttpClient.CodeFunc(int(Id), artist + song), random.randint(0,0xFFFFFFFFFFFF))
177  log('%s: search url: %s' % (__title__, url))
178  header = {'User-Agent':UserAgent}
179  req = urllib.request.Request(url, headers=header)
180  f = urllib.request.urlopen(req)
181  Page = f.read().decode('utf-8')
182  except:
183  log("%s: %s::%s (%d) [%s]" % (
184  __title__, self.__class__.__name__,
185  sys.exc_info()[2].tb_frame.f_code.co_name,
186  sys.exc_info()[2].tb_lineno,
187  sys.exc_info()[1]
188  ))
189  return None
190  # ttplayer occasionally returns incorrect lyrics. if we have a 'ti' and/or an 'ar' tag with a value we can check if they match the title and artist
191  if Page.startswith('[ti:'):
192  check = Page.split('\n')
193  if not check[0][4:-1] == '':
194  if (difflib.SequenceMatcher(None, song.lower(), check[0][4:-1].lower()).ratio() > 0.8):
195  return Page
196  else:
197  return ''
198  if check[1][0:4] == '[ar:' and not check[1][4:-1] == '':
199  if (difflib.SequenceMatcher(None, artist.lower(), check[1][4:-1].lower()).ratio() > 0.8):
200  return Page
201  else:
202  return ''
203  else:
204  return Page
205  elif Page.startswith('['):
206  return Page
207  return ''
ttplayer.lyricsScraper.LyricsFetcher.get_lyrics_from_list
def get_lyrics_from_list(self, link)
Definition: lyricsScraper.py:172
ttplayer.lyricsScraper.LyricsFetcher.get_lyrics
def get_lyrics(self, song)
Definition: lyricsScraper.py:118
utils
ttplayer.lyricsScraper.ttpClient
Definition: lyricsScraper.py:30
ttplayer.lyricsScraper.ttpClient.EncodeArtTit
def EncodeArtTit(data)
Definition: lyricsScraper.py:105
ttplayer.lyricsScraper.LyricsFetcher.__init__
def __init__(self)
Definition: lyricsScraper.py:114
ttplayer.lyricsScraper.ttpClient.CodeFunc
def CodeFunc(Id, data)
Definition: lyricsScraper.py:37
ttplayer.lyricsScraper.LyricsFetcher.LYRIC_URL
LYRIC_URL
Definition: lyricsScraper.py:116
ttplayer.lyricsScraper.LyricsFetcher.LIST_URL
LIST_URL
Definition: lyricsScraper.py:115
xbmc.log
None log(str msg, int level=LOGDEBUG)
Definition: xbmc.py:9
ttplayer.lyricsScraper.LyricsFetcher
Definition: lyricsScraper.py:113