MythTV master
tedtalksXSL_api.py
Go to the documentation of this file.
1# -*- coding: UTF-8 -*-
2
3# ----------------------
4# Name: tedtalksXSL_api - XPath and XSLT functions for the TedTalks RSS/HTML items
5# Python Script
6# Author: R.D. Vaughan
7# Purpose: This python script is intended to perform a variety of utility functions
8# for the conversion of data to the MNV standard RSS output format.
9# See this link for the specifications:
10# http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format
11#
12# License:Creative Commons GNU GPL v2
13# (http://creativecommons.org/licenses/GPL/2.0/)
14#-------------------------------------
# Module metadata.
__title__ = "tedtalksXSL_api - XPath and XSLT functions for the TedTalks RSS/HTML"
__author__ = "R.D. Vaughan"
__purpose__ = '''
This python script is intended to perform a variety of utility functions
for the conversion of data to the MNV standard RSS output format.
See this link for the specifications:
http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format
'''

# Version history:
#   0.1.0 Initial development
#   0.1.1 Fixed URL of Flash player due to web site change
__version__ = "v0.1.1"


# Names of the classes that provide XPath extension functions
__xpathClassList__ = ['xpathFunctions']

# Names of the XSLT extension classes. Each class is one stand-alone extension function.
# Example: __xsltExtentionList__ = ['xsltExtExample', ]
__xsltExtentionList__ = []
35
36import os, sys, re, time, datetime, shutil, urllib.request, urllib.parse, urllib.error, string
37from copy import deepcopy
38import io
39
40
class OutStreamEncoder(object):
    """Proxy around an output stream that encodes text before writing it.

    Text is encoded with ``encoding`` and written to the wrapped stream's
    underlying ``buffer``; every other attribute is looked up on the
    wrapped stream itself.
    """

    def __init__(self, outstream, encoding=None):
        """Remember the stream and pick the encoding.

        When no encoding is supplied the file system encoding is used.
        """
        self.out = outstream
        self.encoding = encoding if encoding else sys.getfilesystemencoding()

    def write(self, obj):
        """Write ``obj`` to the wrapped stream's buffer.

        ``str`` objects are encoded first; anything else is passed through
        unchanged. An ``OSError`` raised by the write itself is ignored.
        """
        payload = obj.encode(self.encoding) if isinstance(obj, str) else obj
        try:
            self.out.buffer.write(payload)
        except OSError:
            pass

    def __getattr__(self, attr):
        """Forward every attribute not found on the proxy to the wrapped stream."""
        return getattr(self.out, attr)
62
# When stdout is a plain text wrapper, route both standard output streams
# through the UTF-8 encoding proxy.
if isinstance(sys.stdout, io.TextIOWrapper):
    sys.stdout = OutStreamEncoder(sys.stdout, 'utf8')
    sys.stderr = OutStreamEncoder(sys.stderr, 'utf8')

# The module cannot work without these libraries: report the problem on
# stderr and terminate with exit status 1 if either import fails.
try:
    from io import StringIO
    from lxml import etree
except Exception as importError:
    sys.stderr.write('\n! Error - Importing the "lxml" and "StringIO" python libraries failed on error(%s)\n' % importError)
    sys.exit(1)
72
73
class xpathFunctions(object):
    """XPath extension functions for the TedTalks RSS/HTML items.

    The method names listed in ``self.functList`` are the functions offered
    as XPath extensions; ``stripSubstring`` and ``parameterArgs`` are
    internal helpers.

    NOTE(review): ``common`` (used by ``tedtalksMakeItem``) is neither
    defined nor imported in this file; presumably it is injected into the
    module namespace by the calling framework - confirm against the caller.
    """

    def __init__(self):
        # Methods of this class that are XPath extension functions
        self.functList = ['tedtalksMakeItem', 'tedtalksGetItem', 'tedtalksMakeLink', 'tedtalksTitleRSS', ]
        self.namespaces = {
            'media': "http://search.yahoo.com/mrss/",
            'xhtml': "http://www.w3.org/1999/xhtml",
            'mythtv': "http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format",
            }
        # XPath filters
        self.descriptionFilter = etree.XPath('//p[@id="tagline"]', namespaces=self.namespaces)
        self.durationFilter = etree.XPath('//dl[@class="talkMedallion clearfix"]//em[@class="date"]/text()', namespaces=self.namespaces)
        # Elements built by tedtalksMakeItem, keyed by page URL, until
        # tedtalksGetItem collects them
        self.persistence = {}
        # Flash player URL template; "%s" receives the video file URL
        self.flvPlayerLink = 'http://static.hd-trailers.net/mediaplayer/player.swf?autostart=true&backcolor=000000&frontcolor=999999&lightcolor=000000&screencolor=000000&controlbar=over&file=%s'
    # end __init__()

    def tedtalksMakeItem(self, context, *arg):
        '''Generate item elements from a Video HTML page on the TedTalks site.
        Call example: 'mnvXpath:tedtalksMakeItem(concat('http://www.ted.com', normalize-space(./@href), $paraMeter))/link'
        arg[0] is the URL of the video page, arg[1] the "key=value;..." parameter
        string, which must supply the "flv" and "download" keys.
        A copy of the result is stored in self.persistence under the page URL.
        return a "media" element holding the item's child elements, or an empty
        "xml" element when the page could not be read
        '''
        webURL = arg[0]
        parmDict = self.parameterArgs(arg[1])

        # Read the detailed Web page; the "with" block closes the handle
        # even when reading or decoding fails
        try:
            with urllib.request.urlopen(webURL) as tmpHandle:
                htmlString = str(tmpHandle.read(), 'utf-8')
        except Exception as errmsg:
            sys.stderr.write('! Error: TedTalk web page read issue for URL(%s)\nerror(%s)\n' % (webURL, errmsg))
            return etree.XML("<xml></xml>")

        htmlElementTree = etree.HTML(htmlString)

        # Create the base element that will contain the item elements. It must have a "media" namespace
        mediaNamespace = "http://search.yahoo.com/mrss/"
        media = "{%s}" % mediaNamespace
        NSMAP = {'media': mediaNamespace}
        elementTmp = etree.Element(media + "media", nsmap=NSMAP)

        # Get and format the publishing Date. The page value is prefixed
        # with a day number ("1 ") before being parsed as "%d %b %Y".
        tmpPubDate = self.stripSubstring(htmlString, '\tpd:"', '"')
        if tmpPubDate:
            tmpPubDate = common.pubDate('dummy', '1 '+tmpPubDate, "%d %b %Y")
        else:
            tmpPubDate = common.pubDate('dummy', '')

        # Get the flv link; fall back to the Web page itself when the page
        # has no "hs" entry
        tmpHighStream = self.stripSubstring(htmlString, '\ths:"', '"')
        if tmpHighStream:
            tmpVideoURL = 'http://video.ted.com/%s' % tmpHighStream.replace('high', parmDict['flv'])
            tmpFlvLink = self.flvPlayerLink % tmpVideoURL
        else:
            tmpFlvLink = webURL

        # Get the download link
        tmpFileName = self.stripSubstring(htmlString, '\ths:"talks/dynamic/', '-')
        tmpDownloadLink = 'http://video.ted.com/talks/podcast/%s' % tmpFileName
        if parmDict['download'] == 'HD':
            tmpDownloadLink += '_480.mp4'
        else:
            tmpDownloadLink += '.mp4'

        # Get thumbnail link
        tmpThumbNail = self.stripSubstring(htmlString, 'amp;su=', '&amp')

        # Get item description
        tmpDesc = self.descriptionFilter(htmlElementTree)
        if len(tmpDesc):
            tmpDesc = tmpDesc[0].text
        else:
            tmpDesc = ''

        # Get duration: only the text before the first space is converted
        tmpDuration = ''
        tmpDurationText = self.durationFilter(htmlElementTree)
        if len(tmpDurationText):
            index = tmpDurationText[0].find(' ')
            if index != -1:
                tmpDuration = common.convertDuration('dummy', tmpDurationText[0][:index])

        # Add Item elements and attributes
        etree.SubElement(elementTmp, "pubDate").text = tmpPubDate
        etree.SubElement(elementTmp, "description").text = tmpDesc
        etree.SubElement(elementTmp, "link").text = tmpFlvLink
        tmpgroup = etree.SubElement(elementTmp, media + "group")
        tmpTNail = etree.SubElement(tmpgroup, media + "thumbnail")
        tmpTNail.attrib['url'] = tmpThumbNail
        tmpContent = etree.SubElement(tmpgroup, media + "content")
        tmpContent.attrib['url'] = tmpDownloadLink
        tmpContent.attrib['duration'] = tmpDuration
        tmpContent.attrib['lang'] = 'en'

        # Keep a copy so a later tedtalksGetItem call can return it
        self.persistence[webURL] = deepcopy(elementTmp)
        return elementTmp
    # end tedtalksMakeItem()

    def tedtalksGetItem(self, context, *arg):
        '''Return item elements that were previously created in "tedtalksMakeItem" call
        Call example: 'mnvXpath:tedtalksGetItem(concat('http://www.ted.com', normalize-space(./@href))/*'
        The stored element is removed from self.persistence, so each one can be
        fetched only once; a KeyError is raised when nothing is stored for arg[0].
        return an number of item elements
        '''
        return self.persistence.pop(arg[0])
    # end tedtalksGetItem()

    def tedtalksMakeLink(self, context, *arg):
        '''Build the Flash player link for a talk from its download link
        Call example: 'mnvXpath:tedtalksMakeLink(enclosure/@url, $paraMeter)'
        arg[0] is the download link, arg[1] the "key=value;..." parameter string,
        which must supply the "flv" key.
        return a link for playing the flv file
        '''
        tmpDownloadLink = arg[0]
        parmDict = self.parameterArgs(arg[1])
        # Keep the last path component (including its leading "/") and drop
        # the download specific suffixes
        index = tmpDownloadLink.rfind('/')
        baseName = tmpDownloadLink[index:].replace('_480', '').replace('.mp4', '')
        videoFileName = 'http://video.ted.com/talks/dynamic%s' % baseName
        videoFileName += '-%s.flv' % parmDict['flv']
        return self.flvPlayerLink % videoFileName
    # end tedtalksMakeLink()

    def tedtalksTitleRSS(self, context, *arg):
        '''Remove everything from the last "-" onward from an RSS item title
        Call example: 'mnvXpath:tedtalksTitleRSS(string(title))'
        A title without a "-" is returned unchanged.
        return a massaged title string
        '''
        title = arg[0]
        index = title.rfind('-')
        if index == -1:
            return title
        return title[:index].strip()
    # end tedtalksTitleRSS()

    def stripSubstring(self, string, startText, terminatorChar):
        '''Return the text found between "startText" and the next "terminatorChar"
        The result has surrounding white space removed. An empty string is returned
        when either the start text or the terminator is not found.
        return a substring
        '''
        start = string.find(startText)
        if start == -1:
            return ''
        start += len(startText)
        end = string.find(terminatorChar, start)
        if end == -1:
            return ''
        return string[start:end].strip()
    # end stripSubstring()

    def parameterArgs(self, parameters, terminatorChar=';'):
        '''Set the parameters for TedTalks
        "parameters" is a string of "key=value" pairs separated by "terminatorChar".
        Segments without a "=" (for example the empty one left by a trailing
        separator) are ignored, and a value keeps any further "=" it contains.
        return a dictionary of parameters
        '''
        paramDict = {}
        for segment in parameters.split(terminatorChar):
            key, separator, value = segment.partition('=')
            if not separator:
                # Not a "key=value" pair; nothing to record
                continue
            paramDict[key] = value
        return paramDict
    # end parameterArgs()
241
242
247
248
253
254
def tedtalksMakeItem(self, context, *arg)
Start of XPath extension functions.
def parameterArgs(self, parameters, terminatorChar=';')
def stripSubstring(self, string, startText, terminatorChar)
static pid_list_t::iterator find(const PIDInfoMap &map, pid_list_t &list, pid_list_t::iterator begin, pid_list_t::iterator end, bool find_open)