MythTV master
lyricsScraper.py
Go to the documentation of this file.
1#-*- coding: UTF-8 -*-
2"""
3Scraper for https://xiami.com
4
5Taxigps
6"""
7
8import urllib.parse
9import socket
10import re
11import difflib
12import json
13import chardet
14import requests
15from utilities import *
16
# Name of this scraper; used as the lyrics source label and as the prefix of
# every log message emitted by the methods below.
__title__ = 'Xiami'
# Kept as a string, exactly as declared.
# NOTE(review): presumably read by the scraper loader to order sources - confirm.
__priority__ = '110'
# Copied onto every Lyrics result as its ``lrc`` attribute.
__lrc__ = True

# Browser-like User-Agent header sent with every HTTP request to xiami.com.
UserAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'

# Process-wide default timeout (seconds) for newly created sockets.
socket.setdefaulttimeout(10)
24
def __init__(self):
    """Create the shared HTTP session and the xiami.com URL templates."""
    # A single session is reused for every request this scraper makes.
    self.session = requests.Session()
    # Search page; ``%s`` is filled with the URL-quoted "title artist" keyword.
    self.LIST_URL = 'https://www.xiami.com/search?key=%s'
    # Playlist endpoint; ``%s`` is filled with the song id from a search hit.
    self.SONG_URL = 'https://www.xiami.com/song/playlist/id/%s/object_name/default/object_id/0'
30
def get_lyrics(self, song):
    """Search xiami.com for *song* and return the lyrics of the best match.

    The search page is queried with "<title> <artist>". Every hit whose
    artist and title are each more than 80% similar (case-insensitive
    ``difflib`` ratio) to the requested song is kept as a candidate, and
    the lyrics of the first candidate are downloaded.

    Args:
        song: object exposing ``artist`` and ``title`` strings.

    Returns:
        A ``Lyrics`` object with ``lyrics`` filled in (and ``list`` set to
        all candidates when there is more than one), or ``None`` when the
        search request fails, nothing matches, or no lyrics can be fetched.
    """
    log("%s: searching lyrics for %s - %s" % (__title__, song.artist, song.title))
    lyrics = Lyrics()
    lyrics.song = song
    lyrics.source = __title__
    lyrics.lrc = __lrc__

    keyword = "%s %s" % (song.title, song.artist)
    url = self.LIST_URL % (urllib.parse.quote(keyword))
    try:
        # requests never times out unless told to, so pass the timeout
        # explicitly (same 10 seconds as the module-wide socket default).
        response = self.session.get(
            url,
            headers={'User-Agent': UserAgent, 'Referer': 'https://www.xiami.com/play'},
            timeout=10,
        )
        result = response.text
    except Exception as err:
        # ``Exception`` rather than a bare except, so that KeyboardInterrupt
        # and SystemExit still propagate. The traceback is taken from the
        # exception itself, which avoids depending on a ``sys`` import.
        tb = err.__traceback__
        log("%s: %s::%s (%d) [%s]" % (
            __title__, self.__class__.__name__,
            tb.tb_frame.f_code.co_name,
            tb.tb_lineno,
            err
        ))
        return None

    # Each hit yields (song id, song title, artist name).
    match = re.compile('<td class="chkbox">.+?value="(.+?)".+?href="//www.xiami.com/song/[^"]+" title="([^"]+)".*?href="//www.xiami.com/artist/[^"]+" title="([^"]+)"', re.DOTALL).findall(result)

    wanted_artist = song.artist.lower()
    wanted_title = song.title.lower()
    links = []
    for song_id, title, artist in match:
        artist_ok = difflib.SequenceMatcher(None, wanted_artist, artist.lower()).ratio() > 0.8
        if artist_ok and difflib.SequenceMatcher(None, wanted_title, title.lower()).ratio() > 0.8:
            links.append((artist + ' - ' + title, song_id, artist, title))

    if not links:
        return None
    if len(links) > 1:
        # Expose the alternatives so the caller can offer a choice.
        lyrics.list = links

    lyr = self.get_lyrics_from_list(links[0])
    if not lyr:
        return None
    lyrics.lyrics = lyr
    return lyrics
66
67 def get_lyrics_from_list(self, link):
68 title,id,artist,song = link
69 try:
70 response = self.session.get(self.SONG_URL % (id), headers={'User-Agent': UserAgent, 'Referer': 'https://www.xiami.com/play'})
71 result = response.text
72 data = json.loads(result)
73 if 'data' in data and 'trackList' in data['data'] and data['data']['trackList'] and 'lyric' in data['data']['trackList'][0] and data['data']['trackList'][0]['lyric']:
74 url = data['data']['trackList'][0]['lyric']
75 except:
76 log( "%s: %s::%s (%d) [%s]" % (
77 __title__, self.__class__.__name__,
78 sys.exc_info()[ 2 ].tb_frame.f_code.co_name,
79 sys.exc_info()[ 2 ].tb_lineno,
80 sys.exc_info()[ 1 ]
81 ))
82 return
83 try:
84 response = self.session.get(url, headers={'User-Agent': UserAgent, 'Referer': 'https://www.xiami.com/play'})
85 lyrics = response.content
86 except:
87 log( "%s: %s::%s (%d) [%s]" % (
88 __title__, self.__class__.__name__,
89 sys.exc_info()[ 2 ].tb_frame.f_code.co_name,
90 sys.exc_info()[ 2 ].tb_lineno,
91 sys.exc_info()[ 1 ]
92 ))
93 return
94 enc = chardet.detect(lyrics)
95 lyrics = lyrics.decode(enc['encoding'], 'ignore')
96 return lyrics
None log(str msg, int level=LOGDEBUG)
Definition: xbmc.py:9