MythTV  master
tributeca_api.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # -*- coding: UTF-8 -*-
3 # ----------------------
4 # Name: tributeca_api - XPath and XSLT functions for the Tribute.ca grabber
5 # Python Script
6 # Author: R.D. Vaughan
7 # Purpose: This python script is intended to perform a variety of utility functions
8 # for the conversion of data to the MNV standard RSS output format.
9 # See this link for the specifications:
10 # http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format
11 #
12 # License:Creative Commons GNU GPL v2
13 # (http://creativecommons.org/licenses/GPL/2.0/)
14 #-------------------------------------
# Module metadata: descriptive strings about this script
__title__ ="tributeca_api - XPath and XSLT functions for the Tribute.ca grabber"
__author__="R.D. Vaughan"
__purpose__='''
This python script is intended to perform a variety of utility functions
for the conversion of data to the MNV standard RSS output format.
See this link for the specifications:
http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format
'''

__version__="v0.1.1"
# 0.1.0 Initial development
# 0.1.1 Changes due to Web site modifications


# Specify the class names that have XPath extension functions
__xpathClassList__ = ['xpathFunctions', ]

# Specify the XSLT extension class names. Each class is a standalone extension function
# (the example class in this file is deliberately left out of the active list)
#__xsltExtentionList__ = ['xsltExtExample', ]
__xsltExtentionList__ = []
35 
36 import os, sys, re, time, datetime, shutil, urllib, string
37 from copy import deepcopy
38 
39 
class OutStreamEncoder(object):
    """Stream proxy that encodes unicode text before handing it to the wrapped stream"""
    def __init__(self, outstream, encoding=None):
        """Remember the wrapped stream and the encoding to use.
        outstream - the stream object that receives the (encoded) output
        encoding  - name of the encoding; when empty or None the file system encoding is used
        """
        self.out = outstream
        self.encoding = encoding or sys.getfilesystemencoding()

    def write(self, obj):
        """Write obj to the wrapped stream. Unicode strings are encoded with self.encoding
        first, anything else is passed through untouched.
        An IOError raised by the wrapped stream is deliberately ignored (best effort output).
        """
        if isinstance(obj, unicode):
            payload = obj.encode(self.encoding)
        else:
            payload = obj
        try:
            self.out.write(payload)
        except IOError:
            pass

    def __getattr__(self, attr):
        """Every attribute other than the ones defined here is looked up on the wrapped stream"""
        return getattr(self.out, attr)
# Route stdout/stderr through OutStreamEncoder so that unicode strings are written as UTF-8
sys.stdout = OutStreamEncoder(sys.stdout, 'utf8')
sys.stderr = OutStreamEncoder(sys.stderr, 'utf8')

# Both libraries are mandatory: report the failure and stop if either import fails
try:
    from StringIO import StringIO
    from lxml import etree
except Exception as e:
    sys.stderr.write(u'\n! Error - Importing the "lxml" and "StringIO" python libraries failed on error(%s)\n' % e)
    sys.exit(1)

# Check that the lxml library is current enough
# From the lxml documents it states: (http://codespeak.net/lxml/installation.html)
# "If you want to use XPath, do not use libxml2 2.6.27. We recommend libxml2 2.7.2 or later"
# Testing was performed with the Ubuntu 9.10 "python-lxml" version "2.1.5-1ubuntu2" repository package
#
# "version" is the dotted string used for display only. The actual test compares the
# numeric tuple: comparing the strings would sort "2.10.0" before "2.7.2" and wrongly
# reject newer libxml releases as too old.
version = '.'.join([str(digit) for digit in etree.LIBXML_VERSION])
if tuple(etree.LIBXML_VERSION) < (2, 7, 2):
    sys.stderr.write(u'''
! Error - The installed version of the "lxml" python library "libxml" version is too old.
 At least "libxml" version 2.7.2 must be installed. Your version is (%s).
''' % version)
    sys.exit(1)
89 
90 
class xpathFunctions(object):
    """XPath extension functions specific to the Tribute.ca grabber.

    State is shared between calls through the self.persistence dictionary:
      'tributecaLinkGeneration' - video code or Web page URL found by the last link generation,
                                  reset to None once it has been handed out a second time
      'tributecaThumbnailLink'  - title based directory name used to build a poster image URL

    NOTE(review): the 'common' object used throughout is not defined in this file. Presumably
    it is supplied by the code that loads this module - verify against the loader.
    """
    def __init__(self):
        # Names of the methods offered as XPath extension functions. The private helper
        # methods (leading underscore) are intentionally not part of this list.
        self.functList = ['tributecaLinkGeneration', 'tributecaThumbnailLink', 'tributecaTopTenTitle', 'tributecaIsCustomHTML', 'tributecaCheckIfDBItem', 'tributecaDebug', 'tributecaGetAnchors', ]
        self.TextTail = etree.XPath("string()")
        self.anchorList = etree.XPath(".//a", namespaces=common.namespaces)
        self.persistence = {}
    # end __init__()

    def _flvExists(self, url):
        '''Ask common.checkURL() about the url.
        return True if the check succeeded and the content type is "video/x-flv"
        return False otherwise
        '''
        resultCheckUrl = common.checkURL(url)
        # Only look at the headers when the check itself succeeded
        return bool(resultCheckUrl[0]) and resultCheckUrl[1]['Content-Type'] == u'video/x-flv'
    # end _flvExists()

    def _videoCode(self, title):
        '''Massage a title into the code used in the FLV file name: lower case, no spaces,
        no punctuation, URL quoted.
        return the video code
        '''
        compact = title.lower().replace(u' ', u'').encode("utf-8")
        return urllib.quote_plus(re.sub('[%s]' % re.escape(string.punctuation), '', compact))
    # end _videoCode()

    def tributecaLinkGeneration(self, context, *args):
        '''Generate a link for the Tribute.ca site. Significant massaging of the title is required.
        Call example: 'mnvXpath:tributecaLinkGeneration(position(), ..//a)'
        args[0] is the 1 based position of the anchor, args[1] is the list of anchor elements.
        The function is called in pairs. The first call works out the link, remembers the
        result in self.persistence and returns the "link" element value. The following call
        returns the remembered result as the download url and clears it again.
        return the url link
        '''
        downloadURL = u'http://www.tribute.ca/streamingflash/%s.flv'
        position = int(args[0])-1
        webURL = u'http://www.tribute.ca%s' % args[1][position].attrib['href'].strip()

        # If this is for the download then just return what was found for the "link" element
        returnValue = self.persistence.get('tributecaLinkGeneration')
        if returnValue is not None:
            self.persistence['tributecaLinkGeneration'] = None
            if returnValue != webURL:
                return downloadURL % returnValue
            else:
                return webURL

        currentTitle = self.TextTail(args[1][position]).strip()
        if position == 0:
            previousTitle = u''
        else:
            previousTitle = self.TextTail(args[1][position-1]).strip()

        # Rule: "IMAX: Hubble 3D": http://www.tribute.ca/streamingflash/hubble3d.flv
        titleArray = [currentTitle, previousTitle]
        if titleArray[0].startswith(u'IMAX:'):
            titleArray[0] = titleArray[0].replace(u'IMAX:', u'').strip()
        else:
            # Rule: "How to Train Your Dragon: An IMAX 3D Experience" did not even have a trailer
            #       on the Web page but strip off anything after the ":"
            for counter in range(len(titleArray)):
                index = titleArray[counter].find(": ")
                if index != -1:
                    titleArray[counter] = titleArray[counter][:index].strip()
                index = titleArray[counter].find(" (")
                if index != -1:
                    titleArray[counter] = titleArray[counter][:index].strip()
                if titleArray[0].startswith(titleArray[1]) and titleArray[1]:
                    index = titleArray[counter].find("3D")
                    if index != -1:
                        titleArray[counter] = titleArray[counter][:index].strip()

        # If the previous title starts with the same title as the current then this is trailer #2
        trailer2 = u''
        if titleArray[0].startswith(titleArray[1]) and titleArray[1]:
            trailer2 = u'tr2'
        if currentTitle.find(': An IMAX') != -1:
            trailer2 = u'tr2'
        titleArray[0] = titleArray[0].replace(u'&', u'and')
        # Remember the directory style name so tributecaThumbnailLink() can build a poster url
        self.persistence['tributecaThumbnailLink'] = urllib.quote_plus(titleArray[0].lower().replace(u' ', u'_').replace(u"'", u'').replace(u'-', u'_').replace(u'?', u'').replace(u'.', u'').encode("utf-8"))
        titleArray[0] = self._videoCode(titleArray[0])

        # Verify that the FLV file url really exists. If it does not then use the Web page link.
        videocode = u'%s%s' % (titleArray[0], trailer2)
        flvURL = downloadURL % videocode
        if not self._flvExists(flvURL):
            if trailer2 != u'':
                # Drop the 'tr2' this time
                videocode = titleArray[0]
                flvURL = downloadURL % videocode
                if not self._flvExists(flvURL):
                    flvURL = webURL
            else:
                # Add the 'tr2' this time
                videocode = titleArray[0]+u'tr2'
                flvURL = downloadURL % videocode
                if not self._flvExists(flvURL):
                    if currentTitle.find(': An IMAX') == -1 and currentTitle.find(': ') != -1:
                        # Last attempt: use the complete title, including the part after the ":"
                        videocode = self._videoCode(currentTitle.replace(u'&', u'and'))
                        flvURL = downloadURL % videocode
                        if not self._flvExists(flvURL):
                            flvURL = webURL
                    else:
                        flvURL = webURL
        if flvURL != webURL:
            self.persistence['tributecaLinkGeneration'] = videocode
            return common.linkWebPage(u'dummycontext', 'tributeca')+videocode
        else:
            self.persistence['tributecaLinkGeneration'] = flvURL
            return flvURL
    # end linkGeneration()

    def tributecaThumbnailLink(self, context, *args):
        '''Verify that the thumbnail actually exists. If it does not then use the site image.
        Call example: 'mnvXpath:tributecaThumbnailLink(string(.//img/@src))'
        The site image is also used when no image source was passed or when no title has
        been remembered by tributecaLinkGeneration() yet.
        return the thumbnail url
        '''
        siteImage = u'http://www.tribute.ca/images/tribute_title.gif'
        # .get() so that a call made before any link generation falls back to the site image
        thumbnailName = self.persistence.get('tributecaThumbnailLink')
        if not len(args[0]) or not thumbnailName:
            return siteImage

        if args[0].startswith(u'http:'):
            url = args[0].strip()
        else:
            url = u'http://www.tribute.ca/tribute_objects/images/movies/%s%s' % (thumbnailName, u'/poster.jpg')
        resultCheckUrl = common.checkURL(url)
        if not resultCheckUrl[0] or resultCheckUrl[1]['Content-Type'] != u'image/jpeg':
            return siteImage

        return url
    # end tributecaThumbnailLink()

    def tributecaTopTenTitle(self, context, *args):
        '''Take a top ten title and add a leading '0' if less than 10 as it forces correct sort order
        Call example: 'mnvXpath:tributecaTopTenTitle(string(..))'
        A title is treated as "less than 10" when its first '.' is the second character.
        return a replacement title
        '''
        if not len(args[0]):
            return args[0]

        if args[0].find('.') == 1:
            return u'0'+args[0]
        return args[0]
    # end tributecaTopTenTitle()

    def tributecaIsCustomHTML(self, context, *args):
        '''Check if the link is for a custom HTML
        Example call: mnvXpath:isCustomHTML(('dummy'))
        The link checked is the one remembered by the last tributecaLinkGeneration() call.
        return True if the link does not starts with "http://"
        return False if the link starts with "http://" or if there is no remembered link
        '''
        # .get() so that a missing key is handled the same way as a cleared (None) value
        link = self.persistence.get('tributecaLinkGeneration')
        if link is None:
            return False

        return not link.startswith(u'http://')
    # end isCustomHTML()

    def tributecaCheckIfDBItem(self, context, *arg):
        '''Use a unique key value pairing to find out if the 'internetcontentarticles' table already
        has a matching item. This is done to save accessing the Internet when not required.
        Call example: 'mnvXpath:tributecaCheckIfDBItem(.)'
        arg[0] is the title (the word 'Trailer' is removed), arg[1] the author and arg[2] the
        description.
        return True if a match was found
        return False if a match was not found
        '''
        return common.checkIfDBItem('dummy', {
            'feedtitle': 'Movie Trailers',
            'title': arg[0].replace('Trailer', u'').strip(),
            'author': arg[1],
            'description': arg[2],
            })
    # end tributecaCheckIfDBItem()

    def tributecaGetAnchors(self, context, *arg):
        ''' Routine used to get specific anchor elements.
        Unfortunately position dependent.
        Call: mnvXpath:tributecaGetAnchors(//ul[@class='clump'], 3)
        arg[0] is the list of elements and arg[1] the (0 based) index of the element to search.
        return the anchor elements found below the selected element
        '''
        return self.anchorList(arg[0][int(arg[1])])
    # end tributecaGetAnchors()

    def tributecaDebug(self, context, *arg):
        ''' Routine only used for debugging. Prints out the node
        passed as an argument. Not to be used in production.
        Call example: mnvXpath:tributecaDebug(//a)
        return a fixed marker string
        '''
        testpath = etree.XPath(".//a", namespaces=common.namespaces)
        sys.stdout.write('%s\n' % (arg,))
        for count, x in enumerate(arg):
            sys.stdout.write(u'\nElement Count (%s):\n' % count)
#            for y in testpath(x):
#                sys.stdout.write(etree.tostring(y, encoding='UTF-8', pretty_print=True))
            sys.stdout.write("testpath(%s)\n" % (testpath(x),))
        sys.stdout.write('\n')
#        sys.stdout.write(etree.tostring(arg[0], encoding='UTF-8', pretty_print=True))
        return u"========tributecaDebug Called========="
    # end tributecaDebug()
281 
282 
287 
288 
293 
class xsltExtExample(etree.XSLTExtension):
    '''Example of an XSLT extension. This code must be changed to do anything useful!!!
    Converts the text of the first "duration" child of the input node, a ":" separated
    value such as "ss", "mm:ss" or "hh:mm:ss", into a number of seconds and stores the
    result as the text of the output parent.
    return nothing
    '''
    def execute(self, context, self_node, input_node, output_parent):
        # The duration is read from a copy of the input node
        copyItem = deepcopy(input_node)
        min_sec = copyItem.xpath('duration')[0].text.split(':')
        # Each field is worth 60 times the field to its right, so accumulate in base 60.
        # (Multiplying by 60*(fields remaining) dropped the seconds and mis-scaled the hours.)
        seconds = 0
        for field in min_sec:
            seconds = seconds*60 + int(field)
        output_parent.text = u'%s' % seconds
305 
306 
def execute(self, context, self_node, input_node, output_parent)
static pid_list_t::iterator find(const PIDInfoMap &map, pid_list_t &list, pid_list_t::iterator begin, pid_list_t::iterator end, bool find_open)
def tributecaLinkGeneration(self, context, *args)
Start of XPath extension functions.