MythTV  master
tedtalksXSL_api.py
Go to the documentation of this file.
1 # -*- coding: UTF-8 -*-
2 
3 # ----------------------
4 # Name: tedtalksXSL_api - XPath and XSLT functions for the TedTalks RSS/HTML items
5 # Python Script
6 # Author: R.D. Vaughan
7 # Purpose: This python script is intended to perform a variety of utility functions
8 # for the conversion of data to the MNV standard RSS output format.
9 # See this link for the specifications:
10 # http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format
11 #
12 # License:Creative Commons GNU GPL v2
13 # (http://creativecommons.org/licenses/GPL/2.0/)
14 #-------------------------------------
15 __title__ ="tedtalksXSL_api - XPath and XSLT functions for the TedTalks RSS/HTML"
16 __author__="R.D. Vaughan"
17 __purpose__='''
18 This python script is intended to perform a variety of utility functions
19 for the conversion of data to the MNV standard RSS output format.
20 See this link for the specifications:
21 http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format
22 '''
23 
24 __version__="v0.1.1"
25 # 0.1.0 Initial development
26 # 0.1.1 Fixed URL of Flash player due to web site change
27 
28 
29 # Specify the class names that have XPath extension functions
30 __xpathClassList__ = ['xpathFunctions', ]
31 
32 # Specify the XSLT extension class names. Each class is a stand-alone extension function
33 #__xsltExtentionList__ = ['xsltExtExample', ]
34 __xsltExtentionList__ = []
35 
36 import os, sys, re, time, datetime, shutil, urllib.request, urllib.parse, urllib.error, string
37 from copy import deepcopy
38 import io
39 
40 
class OutStreamEncoder(object):
    """Text-stream wrapper that encodes ``str`` output before writing it.

    Writes go to the wrapped stream's underlying ``buffer``; every other
    attribute access is forwarded to the wrapped stream unchanged.
    """

    def __init__(self, outstream, encoding=None):
        """Remember the wrapped stream and the encoding to apply.

        outstream -- the stream to wrap; it must expose a ``buffer`` attribute
        encoding  -- codec name; when empty or None the filesystem encoding
                     reported by ``sys.getfilesystemencoding()`` is used
        """
        self.out = outstream
        self.encoding = encoding or sys.getfilesystemencoding()

    def write(self, obj):
        """Write ``obj`` to the wrapped stream's buffer.

        ``str`` values are encoded with ``self.encoding`` first; anything else
        is handed to the buffer as-is.
        """
        payload = obj.encode(self.encoding) if isinstance(obj, str) else obj
        try:
            self.out.buffer.write(payload)
        except OSError:
            # Output is best effort: a failing console/pipe must not abort the grabber
            pass

    def __getattr__(self, attr):
        """Forward any attribute not defined on the wrapper to the wrapped stream."""
        return getattr(self.out, attr)
62 
# When the console streams are ordinary text wrappers, replace both of them
# with UTF-8 encoders so that later output is always written as UTF-8 bytes.
if isinstance(sys.stdout, io.TextIOWrapper):
    sys.stdout, sys.stderr = (
        OutStreamEncoder(sys.stdout, 'utf8'),
        OutStreamEncoder(sys.stderr, 'utf8'),
    )

# The rest of this module cannot work without lxml: report the failure and stop.
try:
    from io import StringIO
    from lxml import etree
except Exception as importError:
    sys.stderr.write('\n! Error - Importing the "lxml" and "StringIO" python libraries failed on error(%s)\n' % importError)
    sys.exit(1)
72 
# Check that the lxml library is current enough
# From the lxml documents it states: (http://codespeak.net/lxml/installation.html)
# "If you want to use XPath, do not use libxml2 2.6.27. We recommend libxml2 2.7.2 or later"
# Testing was performed with the Ubuntu 9.10 "python-lxml" version "2.1.5-1ubuntu2" repository package
#
# etree.LIBXML_VERSION is a sequence of integers. It must be compared as a
# tuple of numbers: comparing the dotted string would sort "2.10.0" before
# "2.7.2" and wrongly reject every libxml2 release from 2.10 onwards.
version = '.'.join(str(digit) for digit in etree.LIBXML_VERSION)
if tuple(etree.LIBXML_VERSION) < (2, 7, 2):
    sys.stderr.write('''
! Error - The installed version of the "lxml" python library "libxml" version is too old.
 At least "libxml" version 2.7.2 must be installed. Your version is (%s).
''' % version)
    sys.exit(1)
87 
88 
class xpathFunctions(object):
    """XPath extension functions for the TedTalks RSS/HTML grabber.

    The names listed in ``self.functList`` are the methods meant to be called
    from XPath expressions; each of them receives the XPath ``context``
    followed by the positional arguments of the XPath call.

    NOTE(review): ``common`` (used for ``pubDate`` and ``convertDuration``) is
    not defined or imported in the visible source; presumably the code that
    loads this module injects it -- confirm against the loader.
    """
    def __init__(self):
        self.functList = ['tedtalksMakeItem', 'tedtalksGetItem', 'tedtalksMakeLink', 'tedtalksTitleRSS', ]
        self.namespaces = {
            'media': "http://search.yahoo.com/mrss/",
            'xhtml': "http://www.w3.org/1999/xhtml",
            'mythtv': "http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format",
            }
        # XPath filters
        self.descriptionFilter = etree.XPath('//p[@id="tagline"]', namespaces=self.namespaces)
        self.durationFilter = etree.XPath('//dl[@class="talkMedallion clearfix"]//em[@class="date"]/text()', namespaces=self.namespaces)
        # Elements built by tedtalksMakeItem(), keyed by page URL, waiting for tedtalksGetItem()
        self.persistence = {}
        # Flash player URL template; "%s" receives the URL of the video file to play
        self.flvPlayerLink = 'http://static.hd-trailers.net/mediaplayer/player.swf?autostart=true&backcolor=000000&frontcolor=999999&lightcolor=000000&screencolor=000000&controlbar=over&file=%s'
    # end __init__()

    ###########################################################################################################
    ## Start of XPath extension functions
    ###########################################################################################################

    def tedtalksMakeItem(self, context, *arg):
        '''Generate item elements from a Video HTML page on the TedTalks site.
        Call example: mnvXpath:tedtalksMakeItem(concat('http://www.ted.com', normalize-space(./@href)), $paraMeter)/link
        arg[0] is the URL of the talk's Web page and arg[1] the parameter string
        understood by parameterArgs(); the 'flv' and 'download' parameters are required.
        A copy of the result is also stored in self.persistence under the page URL
        so that tedtalksGetItem() can return it later.
        return a "media" element holding the item's child elements, or an empty
        "xml" element when the Web page could not be read
        '''
        webURL = arg[0]
        parmDict = self.parameterArgs(arg[1])

        # Read the detailed Web page. The "with" block closes the connection even
        # when reading or decoding fails.
        try:
            with urllib.request.urlopen(webURL) as tmpHandle:
                htmlString = str(tmpHandle.read(), 'utf-8')
        except Exception as errmsg:
            # Any download or decode problem is reported and an empty element is
            # returned so that the XSLT processing can carry on with the next item.
            sys.stderr.write('! Error: TedTalk web page read issue for URL(%s)\nerror(%s)\n' % (webURL, errmsg))
            return etree.XML("<xml></xml>")

        htmlElementTree = etree.HTML(htmlString)

        # Create the base element that will contain the item elements. It must have a "media" namespace
        mediaNamespace = "http://search.yahoo.com/mrss/"
        media = "{%s}" % mediaNamespace
        NSMAP = {'media': mediaNamespace}
        elementTmp = etree.Element(media + "media", nsmap=NSMAP)

        # Get and format the publishing Date. The page value is passed on with a
        # leading "1 " so that it matches the "%d %b %Y" format.
        tmpPubDate = self.stripSubstring(htmlString, '\tpd:"', '"')
        if tmpPubDate:
            tmpPubDate = common.pubDate('dummy', '1 ' + tmpPubDate, "%d %b %Y")
        else:
            tmpPubDate = common.pubDate('dummy', '')

        # Get the flv link, falling back to the Web page itself when the page
        # carries no 'hs:"' video path
        tmpHighStream = self.stripSubstring(htmlString, '\ths:"', '"')
        if tmpHighStream:
            tmpVideoURL = 'http://video.ted.com/%s' % tmpHighStream.replace('high', parmDict['flv'])
            tmpFlvLink = self.flvPlayerLink % tmpVideoURL
        else:
            tmpFlvLink = webURL

        # Get the download link
        tmpFileName = self.stripSubstring(htmlString, '\ths:"talks/dynamic/', '-')
        tmpDownloadLink = 'http://video.ted.com/talks/podcast/%s' % tmpFileName
        if parmDict['download'] == 'HD':
            tmpDownloadLink += '_480.mp4'
        else:
            tmpDownloadLink += '.mp4'

        # Get thumbnail link
        tmpThumbNail = self.stripSubstring(htmlString, 'amp;su=', '&amp')

        # Get item description
        tmpDesc = self.descriptionFilter(htmlElementTree)
        if len(tmpDesc):
            tmpDesc = tmpDesc[0].text
        else:
            tmpDesc = ''

        # Get duration: only the text in front of the first space is converted
        tmpDuration = self.durationFilter(htmlElementTree)
        if len(tmpDuration):
            index = tmpDuration[0].find(' ')
            if index != -1:
                tmpDuration = common.convertDuration('dummy', tmpDuration[0][:index])
            else:
                tmpDuration = ''
        else:
            tmpDuration = ''

        # Add Item elements and attributes
        etree.SubElement(elementTmp, "pubDate").text = tmpPubDate
        etree.SubElement(elementTmp, "description").text = tmpDesc
        etree.SubElement(elementTmp, "link").text = tmpFlvLink
        tmpgroup = etree.SubElement(elementTmp, media + "group")
        tmpTNail = etree.SubElement(tmpgroup, media + "thumbnail")
        tmpTNail.attrib['url'] = tmpThumbNail
        tmpContent = etree.SubElement(tmpgroup, media + "content")
        tmpContent.attrib['url'] = tmpDownloadLink
        tmpContent.attrib['duration'] = tmpDuration
        tmpContent.attrib['lang'] = 'en'

        self.persistence[webURL] = deepcopy(elementTmp)
        return elementTmp
    # end tedtalksMakeItem()

    def tedtalksGetItem(self, context, *arg):
        '''Return item elements that were previously created in "tedtalksMakeItem" call
        Call example: mnvXpath:tedtalksGetItem(concat('http://www.ted.com', normalize-space(./@href)))/*
        The stored element is removed from self.persistence, so each page URL can
        be fetched only once; a KeyError is raised when nothing is stored for arg[0].
        return an number of item elements
        '''
        return self.persistence.pop(arg[0])
    # end tedtalksGetItem()

    def tedtalksMakeLink(self, context, *arg):
        '''Build the Flash player link for a talk from its download link.
        Call example: mnvXpath:tedtalksMakeLink(enclosure/@url, $paraMeter)
        arg[0] is the download URL and arg[1] the parameter string understood by
        parameterArgs(); the 'flv' parameter is required.
        return a link for playing the flv file
        '''
        tmpDownloadLink = arg[0]
        parmDict = self.parameterArgs(arg[1])
        # Keep the file name (with its leading "/") and drop the download-specific parts
        index = tmpDownloadLink.rfind('/')
        baseName = tmpDownloadLink[index:].replace('_480', '').replace('.mp4', '')
        videoFileName = 'http://video.ted.com/talks/dynamic%s' % baseName
        videoFileName += '-%s.flv' % parmDict['flv']
        return self.flvPlayerLink % videoFileName
    # end tedtalksMakeLink()

    def tedtalksTitleRSS(self, context, *arg):
        '''Remove the text that follows the last "-" of an RSS item title.
        Call example: mnvXpath:tedtalksTitleRSS(string(title))
        return a massaged title string (the title unchanged when it has no "-")
        '''
        title = arg[0]
        index = title.rfind('-')
        if index == -1:
            return title
        return title[:index].strip()
    # end tedtalksTitleRSS()

    def stripSubstring(self, string, startText, terminatorChar):
        '''Return a substring terminated by specific character(s)
        The substring starts right after the first occurrence of startText and
        ends just before the next occurrence of terminatorChar; surrounding
        white space is removed.
        return a substring, or an empty string when either marker is missing
        '''
        index = string.find(startText)
        if index == -1:
            return ''
        string = string[index + len(startText):]
        index = string.find(terminatorChar)
        if index == -1:
            return ''
        return string[:index].strip()
    # end stripSubstring()

    def parameterArgs(self, parameters, terminatorChar=';'):
        '''Set the parameters for TedTalks
        The parameter string is a list of "name=value" pairs separated by
        terminatorChar, for example "flv=high;download=HD". Segments without an
        "=" (such as the empty one left by a trailing separator) are ignored,
        and a value keeps any further "=" characters it contains.
        return a dictionary of parameters
        '''
        paramDict = {}
        for arg in parameters.split(terminatorChar):
            name, separator, value = arg.partition('=')
            if not separator:
                # Not a name=value pair: nothing to record
                continue
            paramDict[name] = value
        return paramDict
    # end parameterArgs()
256 
257 
262 
263 
268 
269 
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.tedtalksMakeItem
def tedtalksMakeItem(self, context, *arg)
Start of XPath extension functions.
Definition: tedtalksXSL_api.py:113
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.__init__
def __init__(self)
Definition: tedtalksXSL_api.py:92
nv_python_libs.xsltfunctions.tedtalksXSL_api.OutStreamEncoder
Definition: tedtalksXSL_api.py:41
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.functList
functList
Definition: tedtalksXSL_api.py:93
nv_python_libs.xsltfunctions.tedtalksXSL_api.OutStreamEncoder.encoding
encoding
Definition: tedtalksXSL_api.py:46
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.parameterArgs
def parameterArgs(self, parameters, terminatorChar=';')
Definition: tedtalksXSL_api.py:245
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.descriptionFilter
descriptionFilter
Definition: tedtalksXSL_api.py:100
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.tedtalksGetItem
def tedtalksGetItem(self, context, *arg)
Definition: tedtalksXSL_api.py:196
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.namespaces
namespaces
Definition: tedtalksXSL_api.py:94
nv_python_libs.xsltfunctions.tedtalksXSL_api.OutStreamEncoder.__init__
def __init__(self, outstream, encoding=None)
Definition: tedtalksXSL_api.py:43
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.tedtalksMakeLink
def tedtalksMakeLink(self, context, *arg)
Definition: tedtalksXSL_api.py:206
nv_python_libs.xsltfunctions.tedtalksXSL_api.OutStreamEncoder.__getattr__
def __getattr__(self, attr)
Definition: tedtalksXSL_api.py:59
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.flvPlayerLink
flvPlayerLink
Definition: tedtalksXSL_api.py:103
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions
Definition: tedtalksXSL_api.py:89
nv_python_libs.xsltfunctions.tedtalksXSL_api.OutStreamEncoder.write
def write(self, obj)
Definition: tedtalksXSL_api.py:50
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.tedtalksTitleRSS
def tedtalksTitleRSS(self, context, *arg)
Definition: tedtalksXSL_api.py:219
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.durationFilter
durationFilter
Definition: tedtalksXSL_api.py:101
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.persistence
persistence
Definition: tedtalksXSL_api.py:102
nv_python_libs.xsltfunctions.tedtalksXSL_api.xpathFunctions.stripSubstring
def stripSubstring(self, string, startText, terminatorChar)
Definition: tedtalksXSL_api.py:231
find
static pid_list_t::iterator find(const PIDInfoMap &map, pid_list_t &list, pid_list_t::iterator begin, pid_list_t::iterator end, bool find_open)
Definition: dvbstreamhandler.cpp:363
nv_python_libs.xsltfunctions.tedtalksXSL_api.OutStreamEncoder.out
out
Definition: tedtalksXSL_api.py:44