MythTV  master
tedtalksXSL_api.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # -*- coding: UTF-8 -*-
3 # ----------------------
4 # Name: tedtalksXSL_api - XPath and XSLT functions for the TedTalks RSS/HTML items
5 # Python Script
6 # Author: R.D. Vaughan
7 # Purpose: This python script is intended to perform a variety of utility functions
8 # for the conversion of data to the MNV standard RSS output format.
9 # See this link for the specifications:
10 # http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format
11 #
12 # License:Creative Commons GNU GPL v2
13 # (http://creativecommons.org/licenses/GPL/2.0/)
14 #-------------------------------------
# Module metadata.
__title__ = "tedtalksXSL_api - XPath and XSLT functions for the TedTalks RSS/HTML"
__author__ = "R.D. Vaughan"
__purpose__ = '''
This python script is intended to perform a variety of utility functions
for the conversion of data to the MNV standard RSS output format.
See this link for the specifications:
http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format
'''

__version__ = "v0.1.1"
# Change history:
# 0.1.0 Initial development
# 0.1.1 Fixed URL of Flash player due to web site change


# Names of the classes in this module that provide XPath extension functions
__xpathClassList__ = ['xpathFunctions']

# Names of the XSLT extension classes. Each class is a stand-alone extension
# function. None are defined for TedTalks (an entry would look like
# 'xsltExtExample').
__xsltExtentionList__ = []
35 
36 import os, sys, re, time, datetime, shutil, urllib, string
37 from copy import deepcopy
38 
39 
class OutStreamEncoder(object):
    """Proxy around an output stream that encodes unicode text on write"""
    def __init__(self, outstream, encoding=None):
        """Remember the wrapped stream and the encoding to apply.

        outstream - the stream object that receives the (encoded) output
        encoding  - codec name; when empty or None the file system encoding
                    reported by sys.getfilesystemencoding() is used
        """
        self.out = outstream
        self.encoding = encoding or sys.getfilesystemencoding()

    def write(self, obj):
        """Write obj to the wrapped stream.

        Unicode strings are encoded with self.encoding first; anything else
        is passed through untouched. An IOError is ignored so that a failing
        output stream does not abort the script.
        """
        try:
            if isinstance(obj, unicode):
                obj = obj.encode(self.encoding)
            self.out.write(obj)
        except IOError:
            pass

    def __getattr__(self, attr):
        """Forward every attribute not defined on this wrapper to the stream"""
        return getattr(self.out, attr)
# Route both standard streams through the encoding wrapper so that unicode
# text is written out as UTF-8.
sys.stdout = OutStreamEncoder(sys.stdout, 'utf8')
sys.stderr = OutStreamEncoder(sys.stderr, 'utf8')

# StringIO and lxml are mandatory; report the failure and stop when either
# one cannot be imported.
try:
    from StringIO import StringIO
    from lxml import etree
except Exception as e:
    sys.stderr.write(
        u'\n! Error - Importing the "lxml" and "StringIO" python libraries failed on error(%s)\n' % e)
    sys.exit(1)
74 
# Check that the lxml library is current enough
# From the lxml documents it states: (http://codespeak.net/lxml/installation.html)
# "If you want to use XPath, do not use libxml2 2.6.27. We recommend libxml2 2.7.2 or later"
# Testing was performed with the Ubuntu 9.10 "python-lxml" version "2.1.5-1ubuntu2" repository package
#
# etree.LIBXML_VERSION is a tuple of integers. The dotted string is kept only
# for display: comparing version strings is lexical, so "2.10.0" would sort
# below "2.7.2" and a newer library would be rejected. Compare the tuple.
version = '.'.join([str(digit) for digit in etree.LIBXML_VERSION])
if tuple(etree.LIBXML_VERSION) < (2, 7, 2):
    sys.stderr.write(u'''
! Error - The installed version of the "lxml" python library "libxml" version is too old.
          At least "libxml" version 2.7.2 must be installed. Your version is (%s).
''' % version)
    sys.exit(1)
89 
90 
class xpathFunctions(object):
    """XPath extension functions specific to the TedTalks grabber.

    The method names listed in self.functList are the ones meant to be
    published as XPath extension functions; the remaining methods are helpers.
    NOTE(review): the methods tedtalksMakeItem relies on a module level name
    "common" (pubDate, convertDuration) that is not defined in this file -
    presumably it is injected by the code that loads this module; confirm.
    """
    def __init__(self):
        """Set up the published function list, namespaces and XPath filters"""
        self.functList = ['tedtalksMakeItem', 'tedtalksGetItem', 'tedtalksMakeLink', 'tedtalksTitleRSS', ]
        self.namespaces = {
            'media': u"http://search.yahoo.com/mrss/",
            'xhtml': u"http://www.w3.org/1999/xhtml",
            'mythtv': "http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format",
            }
        # XPath filters
        self.descriptionFilter = etree.XPath('//p[@id="tagline"]', namespaces=self.namespaces)
        self.durationFilter = etree.XPath('//dl[@class="talkMedallion clearfix"]//em[@class="date"]/text()', namespaces=self.namespaces)
        # Elements built by tedtalksMakeItem, keyed by web page URL, waiting
        # to be collected by tedtalksGetItem
        self.persistence = {}
        self.flvPlayerLink = u'http://static.hd-trailers.net/mediaplayer/player.swf?autostart=true&backcolor=000000&frontcolor=999999&lightcolor=000000&screencolor=000000&controlbar=over&file=%s'
    # end __init__()

    def tedtalksMakeItem(self, context, *arg):
        '''Generate item elements from a Video HTML page on the TedTalks site.
        arg[0] is the URL of the talk's web page and arg[1] the parameter
        string (see parameterArgs) which must supply the "flv" and "download"
        settings.
        Call example: 'mnvXpath:tedtalksMakeItem(concat('http://www.ted.com', normalize-space(./@href)), $paraMeter)/link'
        The element is also stored in self.persistence under the web page URL
        so that tedtalksGetItem can hand it out later.
        return a "media" element holding the item's child elements, or an
        empty "xml" element when the web page could not be read
        '''
        webURL = arg[0]
        parmDict = self.parameterArgs(arg[1])

        # Read the detailed Web page
        htmlString = self._readWebPage(webURL)
        if htmlString is None:
            return etree.XML(u"<xml></xml>")

        htmlElementTree = etree.HTML(htmlString)

        # Create the base element that will contain the item elements. It must have a "media" namespace
        mediaNamespace = "http://search.yahoo.com/mrss/"
        media = "{%s}" % mediaNamespace
        NSMAP = {'media': mediaNamespace}
        elementTmp = etree.Element(media + "media", nsmap=NSMAP)

        # Get and format the publishing Date. The page only carries a month
        # and year, so a day of "1" is prepended to match the "%d %b %Y" format.
        tmpPubDate = self.stripSubstring(htmlString, '\tpd:"', '"')
        if tmpPubDate:
            tmpPubDate = common.pubDate('dummy', u'1 ' + tmpPubDate, "%d %b %Y")
        else:
            tmpPubDate = common.pubDate('dummy', u'')

        # Get the flv link. The page advertises the "high" quality stream;
        # swap in the quality requested through the "flv" parameter. Fall back
        # to the web page itself when no stream is advertised.
        highStream = self.stripSubstring(htmlString, '\ths:"', '"')
        if highStream:
            tmpFlvLink = self.flvPlayerLink % (u'http://video.ted.com/%s' % highStream.replace('high', parmDict['flv']))
        else:
            tmpFlvLink = webURL

        # Get the download link
        tmpFileName = self.stripSubstring(htmlString, '\ths:"talks/dynamic/', '-')
        tmpDownloadLink = u'http://video.ted.com/talks/podcast/%s' % tmpFileName
        if parmDict['download'] == 'HD':
            tmpDownloadLink += '_480.mp4'
        else:
            tmpDownloadLink += '.mp4'

        # Get thumbnail link
        tmpThumbNail = self.stripSubstring(htmlString, 'amp;su=', '&amp')

        # Get item description
        tmpDesc = self.descriptionFilter(htmlElementTree)
        if len(tmpDesc):
            tmpDesc = tmpDesc[0].text
        else:
            tmpDesc = u''

        # Get duration: only the text in front of the first space is converted
        tmpDuration = u''
        durationText = self.durationFilter(htmlElementTree)
        if len(durationText):
            index = durationText[0].find(' ')
            if index != -1:
                tmpDuration = common.convertDuration('dummy', durationText[0][:index])

        # Add Item elements and attributes
        etree.SubElement(elementTmp, "pubDate").text = tmpPubDate
        etree.SubElement(elementTmp, "description").text = tmpDesc
        etree.SubElement(elementTmp, "link").text = tmpFlvLink
        tmpgroup = etree.SubElement(elementTmp, media + "group")
        tmpTNail = etree.SubElement(tmpgroup, media + "thumbnail")
        tmpTNail.attrib['url'] = tmpThumbNail
        tmpContent = etree.SubElement(tmpgroup, media + "content")
        tmpContent.attrib['url'] = tmpDownloadLink
        tmpContent.attrib['duration'] = tmpDuration
        tmpContent.attrib['lang'] = u'en'

        # Keep an independent copy for the later tedtalksGetItem call
        self.persistence[webURL] = deepcopy(elementTmp)
        return elementTmp
    # end tedtalksMakeItem()

    def _readWebPage(self, webURL):
        '''Read a web page and decode it as UTF-8.
        Any failure (open, read or decode) is reported on stderr.
        return the page as a unicode string, or None when it could not be read
        '''
        try:
            tmpHandle = urllib.urlopen(webURL)
            try:
                return unicode(tmpHandle.read(), 'utf-8')
            finally:
                # Release the connection even when the read or decode fails
                tmpHandle.close()
        except Exception as errmsg:
            sys.stderr.write(u'! Error: TedTalk web page read issue for URL(%s)\nerror(%s)\n' % (webURL, errmsg))
            return None
    # end _readWebPage()

    def tedtalksGetItem(self, context, *arg):
        '''Return item elements that were previously created in "tedtalksMakeItem" call
        Call example: 'mnvXpath:tedtalksGetItem(concat('http://www.ted.com', normalize-space(./@href))/*'
        The stored element is removed, so each URL can be fetched only once.
        A KeyError is raised when nothing is stored for the URL in arg[0].
        return the element stored for the URL
        '''
        return self.persistence.pop(arg[0])
    # end tedtalksGetItem()

    def tedtalksMakeLink(self, context, *arg):
        '''Build the Flash player link for a talk from its download link.
        arg[0] is the download URL (or bare file name) and arg[1] the parameter
        string (see parameterArgs) which must supply the "flv" setting.
        Call example: 'mnvXpath:tedtalksMakeLink(enclosure/@url, $paraMeter)'
        return a link for playing the flv file
        '''
        tmpDownloadLink = arg[0]
        parmDict = self.parameterArgs(arg[1])
        # Keep only the file name; rfind() + 1 also copes with a link that
        # contains no '/' at all (the whole string is then the file name)
        baseName = tmpDownloadLink[tmpDownloadLink.rfind('/') + 1:]
        baseName = baseName.replace('_480', u'').replace('.mp4', u'')
        videoFileName = u'http://video.ted.com/talks/dynamic/%s' % baseName
        videoFileName += u'-%s.flv' % parmDict['flv']
        return self.flvPlayerLink % videoFileName
    # end tedtalksMakeLink()

    def tedtalksTitleRSS(self, context, *arg):
        '''Trim an RSS title by dropping everything from its last '-' onward.
        Call example: 'mnvXpath:tedtalksTitleRSS(string(title))'
        return a massaged title string (unchanged when it contains no '-')
        '''
        title = arg[0]
        index = title.rfind('-')
        if index == -1:
            return title
        return title[:index].strip()
    # end tedtalksTitleRSS()

    def stripSubstring(self, string, startText, terminatorChar):
        '''Return the text between the first "startText" and the next
        "terminatorChar" (which may be more than one character), stripped of
        surrounding white space.
        return a substring, or an empty string when either marker is missing
        '''
        index = string.find(startText)
        if index == -1:
            return u''
        string = string[index + len(startText):]
        index = string.find(terminatorChar)
        if index == -1:
            return u''
        return string[:index].strip()
    # end stripSubstring()

    def parameterArgs(self, parameters, terminatorChar=u';'):
        '''Set the parameters for TedTalks
        "parameters" is a string of "name=value" pairs separated by
        "terminatorChar", e.g. "flv=high;download=HD". Segments without an '='
        (an empty string, a trailing terminator) are ignored, and only the
        first '=' of a segment separates the name from its value.
        return a dictionary of parameters
        '''
        paramDict = {}
        for arg in parameters.split(terminatorChar):
            if '=' not in arg:
                continue
            key, value = arg.split('=', 1)
            paramDict[key] = value
        return paramDict
    # end parameterArgs()
258 
259 
264 
265 
270 
271 
def tedtalksMakeItem(self, context, *arg)
Start of XPath extension functions.
static pid_list_t::iterator find(const PIDInfoMap &map, pid_list_t &list, pid_list_t::iterator begin, pid_list_t::iterator end, bool find_open)
def stripSubstring(self, string, startText, terminatorChar)
def parameterArgs(self, parameters, terminatorChar=u';')