MythTV  master
common_api.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # -*- coding: UTF-8 -*-
3 # ----------------------
4 # Name: common_api.py - Common class libraries for all MythNetvision Mashup processing
5 # Python Script
6 # Author: R.D. Vaughan
7 # Purpose: This python script contains a number of common functions used for processing MythNetvision
8 # Grabbers.
9 #
10 # License:Creative Commons GNU GPL v2
11 # (http://creativecommons.org/licenses/GPL/2.0/)
12 #-------------------------------------
13 __title__ ="common_api - Common class libraries for all MythNetvision Mashup processing"
14 __author__="R.D. Vaughan"
15 __purpose__='''
16 This python script is intended to perform a variety of utility functions for the processing of
17 MythNetvision Grabber scripts that run as a Web application and global functions used by many
18 MNV grabbers.
19 '''
20 
21 __version__="v0.2.3"
22 # 0.0.1 Initial development
23 # 0.1.0 Alpha release
# 0.1.1 Added the ability to have a mashup name independent of the mashup title
25 # Added passing on the url the emml hostname and port so a mashup can call other emml mashups
26 # 0.1.2 Modifications to support launching single treeview requests for better integration with MNV
27 # subscription logic.
28 # With the change to allow MNV launching individual tree views the auto shutdown feature had to be
# disabled. Unless a safe work around can be found the feature may need to be removed entirely.
30 # 0.1.3 Modifications to support calling grabbers that run on a Web server
31 # Added a class of global functions that could be used by all grabbers
32 # 0.1.4 Changed the rating item element to default to be empty rather than "0.0"
33 # Changed the default logger to stderr
34 # 0.1.5 Added functions and data structures for common "Mashups" grabbers
35 # Changed the api name from "mashups_api" to "common_api"
# Added XSLT stylesheets as an alternate process option in the threaded URL download functions
37 # 0.1.6 Removed all logic associated with Web CGI calls as the MNV plugin is now on the backend
# Made the pubDate function more adaptable to various input date strings
39 # 0.1.7 Added a common function to get the current selected language (default is 'en' English)
40 # 0.1.8 Fixed a bug with two string functions
41 # Added a customhtml reference for bliptv
42 # 0.1.9 Add a function that allows grabbers to check if an item is already in the data base. This is used
43 # to make grabbers more efficient when processing sources that are largely made up of the same
# data. This is particularly important when a grabber is forced to do additional Internet accesses
# to acquire all the needed MNV item data.
46 # Added a function that checks if there are any treeview items in the data base for a specific
47 # grabber. Some Mashup grabber's search option only returns results when then there are treeview
48 # items in the database.
49 # 0.2.0 Made the creation of custom HTML page links more flexible so code did not need to be changed
50 # when new custom HTML pages were added.
51 # 0.2.1 Add the ability for a parameters to be passed to a XSLT style sheet
52 # 0.2.2 Added a common XPath extention to test if a string starts or ends with a substring
53 # 0.2.3 Fixed Error messages that were not unicode strings
54 
55 import os, struct, sys, re, datetime, time, subprocess, string
56 import urllib
57 import logging
58 import telnetlib
59 from threading import Thread
60 
61 from common_exceptions import (WebCgiUrlError, WebCgiHttpError, WebCgiRssError, WebCgiVideoNotFound, WebCgiXmlError, )
62 
class OutStreamEncoder(object):
    """Wrap an output stream so unicode text is encoded before being written.

    Byte strings pass through unchanged; all other attribute access is
    delegated to the wrapped stream.
    """
    def __init__(self, outstream, encoding=None):
        self.out = outstream
        # Fall back to the filesystem encoding when none was supplied
        self.encoding = encoding if encoding else sys.getfilesystemencoding()

    def write(self, obj):
        """Encode unicode objects with the configured encoding before writing.

        IOErrors raised by the underlying stream are deliberately swallowed.
        """
        try:
            if isinstance(obj, unicode):
                self.out.write(obj.encode(self.encoding))
            else:
                self.out.write(obj)
        except IOError:
            pass

    def __getattr__(self, attr):
        """Delegate everything except write to the wrapped stream."""
        return getattr(self.out, attr)
# Wrap the standard output streams so that non-ASCII text is encoded as utf8
# instead of raising UnicodeEncodeError when printed
sys.stdout = OutStreamEncoder(sys.stdout, 'utf8')
sys.stderr = OutStreamEncoder(sys.stderr, 'utf8')
90 
91 
# lxml is a hard requirement for every MNV grabber; abort with a clear
# message rather than a traceback when it is not installed
try:
    from StringIO import StringIO
    from lxml import etree
except Exception, e:
    sys.stderr.write(u'\n! Error - Importing the "lxml" python library failed on error(%s)\n' % e)
    sys.exit(1)
98 
99 # Check that the lxml library is current enough
100 # From the lxml documents it states: (http://codespeak.net/lxml/installation.html)
101 # "If you want to use XPath, do not use libxml2 2.6.27. We recommend libxml2 2.7.2 or later"
102 # Testing was performed with the Ubuntu 9.10 "python-lxml" version "2.1.5-1ubuntu2" repository package
103 # >>> from lxml import etree
104 # >>> print "lxml.etree:", etree.LXML_VERSION
105 # lxml.etree: (2, 1, 5, 0)
106 # >>> print "libxml used:", etree.LIBXML_VERSION
107 # libxml used: (2, 7, 5)
108 # >>> print "libxml compiled:", etree.LIBXML_COMPILED_VERSION
109 # libxml compiled: (2, 6, 32)
110 # >>> print "libxslt used:", etree.LIBXSLT_VERSION
111 # libxslt used: (1, 1, 24)
112 # >>> print "libxslt compiled:", etree.LIBXSLT_COMPILED_VERSION
113 # libxslt compiled: (1, 1, 24)
114 
# Human-readable dotted version string, used only for the error message
version = '.'.join(str(digit) for digit in etree.LIBXML_VERSION)
# Bug fix: the original compared dotted version STRINGS ("2.10.0" < "2.7.2"
# is lexicographically true), wrongly rejecting newer libxml2 releases.
# Comparing the numeric version tuple is always correct.
if etree.LIBXML_VERSION < (2, 7, 2):
    sys.stderr.write(u'''
! Error - The installed version of the "lxml" python library "libxml" version is too old.
 At least "libxml" version 2.7.2 must be installed. Your version is (%s).
''' % version)
    sys.exit(1)
125 
126 
127 
132 class Common(object):
133  """A collection of common functions used by many grabbers
134  """
    def __init__(self,
                 logger=False,
                 debug=False,
                 ):
        """Set up the state shared by all MNV grabbers.

        @param logger a logging.Logger instance, or False when logging has
                      not been configured yet (see initLogger)
        @param debug  when True several methods emit extra diagnostics
        """
        self.logger = logger
        self.debug = debug
        # Grabber install root: this file's directory with the trailing
        # '/nv_python_libs/common' component stripped off
        self.baseProcessingDir = os.path.dirname( os.path.realpath( __file__ )).replace(u'/nv_python_libs/common', u'')
        # Namespace prefix map used for all lxml XPath evaluations
        self.namespaces = {
            'xsi': u"http://www.w3.org/2001/XMLSchema-instance",
            'media': u"http://search.yahoo.com/mrss/",
            'xhtml': u"http://www.w3.org/1999/xhtml",
            'atm': u"http://www.w3.org/2005/Atom",
            'mythtv': "http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format",
            'itunes':"http://www.itunes.com/dtds/podcast-1.0.dtd",
            }
        # Parsers keyed by source document type; 'xhtml' deliberately uses
        # the HTML parser since real-world feeds are rarely well-formed XML
        self.parsers = {
            'xml': etree.XMLParser(remove_blank_text=True),
            'html': etree.HTMLParser(remove_blank_text=True),
            'xhtml': etree.HTMLParser(remove_blank_text=True),
            }
        # strftime/strptime format of an RSS pubDate element
        self.pubDateFormat = u'%a, %d %b %Y %H:%M:%S GMT'
        # Opening of an MNV RSS document with every namespace declared
        self.mnvRSS = u"""
<rss version="2.0"
    xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"
    xmlns:content="http://purl.org/rss/1.0/modules/content/"
    xmlns:cnettv="http://cnettv.com/mrss/"
    xmlns:creativeCommons="http://backend.userland.com/creativeCommonsRssModule"
    xmlns:media="http://search.yahoo.com/mrss/"
    xmlns:atom="http://www.w3.org/2005/Atom"
    xmlns:amp="http://www.adobe.com/amp/1.0"
    xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:mythtv="http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format">
"""
        # Skeleton of a single MNV RSS <item>; cloned by createItemElement
        self.mnvItem = u'''
<item>
    <title></title>
    <author></author>
    <pubDate></pubDate>
    <description></description>
    <link></link>
    <media:group xmlns:media="http://search.yahoo.com/mrss/">
        <media:thumbnail url=''/>
        <media:content url='' length='' duration='' width='' height='' lang=''/>
    </media:group>
    <rating></rating>
</item>
'''
        # Season and Episode detection regex patterns, tried in order by
        # getSeasonEpisode(); groups are 'seasno' and/or 'epno'
        self.s_e_Patterns = [
            # "Series 7 - Episode 4" or "Series 7 - Episode 4" or "Series 7: On Holiday: Episode 10"
            re.compile(u'''^.+?Series\\ (?P<seasno>[0-9]+).*.+?Episode\\ (?P<epno>[0-9]+).*$''', re.UNICODE),
            # Series 5 - 1
            re.compile(u'''^.+?Series\\ (?P<seasno>[0-9]+)\\ \\-\\ (?P<epno>[0-9]+).*$''', re.UNICODE),
            # Series 1 - Warriors of Kudlak - Part 2
            re.compile(u'''^.+?Series\\ (?P<seasno>[0-9]+).*.+?Part\\ (?P<epno>[0-9]+).*$''', re.UNICODE),
            # Series 3: Programme 3
            re.compile(u'''^.+?Series\\ (?P<seasno>[0-9]+)\\:\\ Programme\\ (?P<epno>[0-9]+).*$''', re.UNICODE),
            # Series 3:
            re.compile(u'''^.+?Series\\ (?P<seasno>[0-9]+).*$''', re.UNICODE),
            # Episode 1
            re.compile(u'''^.+?Episode\\ (?P<seasno>[0-9]+).*$''', re.UNICODE),
            # Title: "s18 | e87"
            re.compile(u'''^.+?[Ss](?P<seasno>[0-9]+).*.+?[Ee](?P<epno>[0-9]+).*$''', re.UNICODE),
            # Description: "season 1, episode 5"
            re.compile(u'''^.+?season\ (?P<seasno>[0-9]+).*.+?episode\ (?P<epno>[0-9]+).*$''', re.UNICODE),
            # Thumbnail: "http://media.thewb.com/thewb/images/thumbs/firefly/01/firefly_01_07.jpg"
            re.compile(u'''(?P<seriesname>[^_]+)\\_(?P<seasno>[0-9]+)\\_(?P<epno>[0-9]+).*$''', re.UNICODE),
            # Guid: "http://traffic.libsyn.com/divefilm/episode54hd.m4v"
            re.compile(u'''^.+?episode(?P<epno>[0-9]+).*$''', re.UNICODE),
            # Season 3, Episode 8
            re.compile(u'''^.+?Season\\ (?P<seasno>[0-9]+).*.+?Episode\\ (?P<epno>[0-9]+).*$''', re.UNICODE),
            # "Episode 1" anywhere in text
            re.compile(u'''^.+?Episode\\ (?P<seasno>[0-9]+).*$''', re.UNICODE),
            # "Episode 1" at the start of the text
            re.compile(u'''Episode\\ (?P<seasno>[0-9]+).*$''', re.UNICODE),
            # "--0027--" when the episode is in the URL link
            re.compile(u'''^.+?--(?P<seasno>[0-9]+)--.*$''', re.UNICODE),
            ]
        # Relative path and module-name suffix used when locating grabber APIs
        self.nv_python_libs_path = u'nv_python_libs'
        self.apiSuffix = u'_api'
        # Current ISO language code; see getLanguage()
        self.language = u'en'
        # MythTV DB connection, created lazily in getDBRecords()
        self.mythdb = None
        # Cached custom HTML page lookup tree; see linkWebPage()
        self.linksWebPage = None
        # Expose the lxml etree module to callers
        self.etree = etree
        # end __init__()
    def massageText(self, text):
        '''Removes HTML markup from a text string.
        @param text The HTML source.
        @return The plain text. If the HTML source contains non-ASCII
        entities or character references, this is a Unicode string.
        '''
        def fixup(m):
            # Translate a single matched tag or entity to its plain-text form
            text = m.group(0)
            if text[:1] == "<":
                return "" # ignore tags
            if text[:2] == "&#":
                # Numeric character reference, hex ("&#x..;") or decimal
                try:
                    if text[:3] == "&#x":
                        return unichr(int(text[3:-1], 16))
                    else:
                        return unichr(int(text[2:-1]))
                except ValueError:
                    pass
            elif text[:1] == "&":
                # Named entity: resolve through the HTML entity table
                import htmlentitydefs
                entity = htmlentitydefs.entitydefs.get(text[1:-1])
                if entity:
                    if entity[:2] == "&#":
                        try:
                            return unichr(int(entity[2:-1]))
                        except ValueError:
                            pass
                    else:
                        return unicode(entity, "iso-8859-1")
            return text # leave as is
        # Strip tags/entities, XML-escape what remains and fold newlines
        return self.ampReplace(re.sub(u"(?s)<[^>]*>|&#?\w+;", fixup, self.textUtf8(text))).replace(u'\n',u' ')
        # end massageText()
253 
254 
255  def initLogger(self, path=sys.stderr, log_name=u'MNV_Grabber'):
256  """Setups a logger using the logging module, returns a logger object
257  """
258  logger = logging.getLogger(log_name)
259  formatter = logging.Formatter('%(asctime)s-%(levelname)s: %(message)s', '%Y-%m-%dT%H:%M:%S')
260 
261  if path == sys.stderr:
262  hdlr = logging.StreamHandler(sys.stderr)
263  else:
264  hdlr = logging.FileHandler(u'%s/%s.log' % (path, log_name))
265 
266  hdlr.setFormatter(formatter)
267  logger.addHandler(hdlr)
268 
269  if self.debug:
270  logger.setLevel(logging.DEBUG)
271  else:
272  logger.setLevel(logging.INFO)
273  self.logger = logger
274  return logger
275  #end initLogger
276 
277 
    def textUtf8(self, text):
        '''Coerce a byte string into a unicode (utf8) string.
        @param text a str, unicode or None value
        @return None unchanged when text is None; u'' when the bytes cannot
        be decoded as utf8; otherwise the input as (or already) unicode
        '''
        if text == None:
            return text
        try:
            return unicode(text, 'utf8')
        except UnicodeDecodeError:
            # Not valid utf8 byte data
            return u''
        except (UnicodeEncodeError, TypeError):
            # Already unicode (or not a string at all): pass through
            return text
        # end textUtf8()
288 
289 
290  def ampReplace(self, text):
291  '''Replace all &, ', ", <, and > characters with the predefined XML
292  entities
293  '''
294  text = self.textUtf8(text)
295  text = text.replace(u'&amp;',u'~~~~~').replace(u'&',u'&amp;').replace(u'~~~~~', u'&amp;')
296  text = text.replace(u"'", u"&apos;").replace(u'"', u'&quot;')
297  text = text.replace(u'<', u'&lt;').replace(u'>', u'&gt;')
298  return text
299  # end ampReplace()
300 
    def callCommandLine(self, command, stderr=False):
        '''Perform the requested command line and return an array of stdout strings and
        stderr strings if stderr=True
        @param command the full command line, run through the shell
        @param stderr when True also capture and return stderr lines
        return array of stdout string array or stdout and stderr string arrays
        # NOTE(review): command is passed with shell=True; callers must never
        # feed it untrusted input
        '''
        stderrarray = []
        stdoutarray = []
        try:
            p = subprocess.Popen(command, shell=True, bufsize=4096, stdin=subprocess.PIPE,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
        except Exception, e:
            # Could not launch the process: log (when possible) and return empty
            if self.logger:
                self.logger.error(u'callCommandLine Popen Exception, error(%s)' % e)
            if stderr:
                return [[], []]
            else:
                return []

        if stderr:
            # Drain stderr first, one line at a time, skipping non-utf8 lines
            while True:
                data = p.stderr.readline()
                if not data:
                    break
                try:
                    data = unicode(data, 'utf8')
                except (UnicodeDecodeError):
                    continue # Skip any line is cannot be cast as utf8 characters
                except (UnicodeEncodeError, TypeError):
                    pass
                stderrarray.append(data)

        # Drain stdout, skipping non-utf8 lines
        while True:
            data = p.stdout.readline()
            if not data:
                break
            try:
                data = unicode(data, 'utf8')
            except (UnicodeDecodeError):
                continue # Skip any line that has non-utf8 characters in it
            except (UnicodeEncodeError, TypeError):
                pass
            stdoutarray.append(data)

        if stderr:
            return [stdoutarray, stderrarray]
        else:
            return stdoutarray
        # end callCommandLine()
349 
350 
352  '''Get longitude and latitiude to find videos relative to your location. Up to three different
353  servers will be tried before giving up.
354  return a dictionary e.g.
355  {'Latitude': '43.6667', 'Country': 'Canada', 'Longitude': '-79.4167', 'City': 'Toronto'}
356  return an empty dictionary if there were any errors
357  Code found at: http://blog.suinova.com/2009/04/from-ip-to-geolocation-country-city.html
358  '''
359  def getExternalIP():
360  '''Find the external IP address of this computer.
361  '''
362  url = urllib.URLopener()
363  try:
364  resp = url.open('http://www.whatismyip.com/automation/n09230945.asp')
365  return resp.read()
366  except:
367  return None
368  # end getExternalIP()
369 
370  ip = getExternalIP()
371 
372  if ip == None:
373  return {}
374 
375  try:
376  gs = urllib.urlopen('http://blogama.org/ip_query.php?ip=%s&output=xml' % ip)
377  txt = gs.read()
378  except:
379  try:
380  gs = urllib.urlopen('http://www.seomoz.org/ip2location/look.php?ip=%s' % ip)
381  txt = gs.read()
382  except:
383  try:
384  gs = urllib.urlopen('http://api.hostip.info/?ip=%s' % ip)
385  txt = gs.read()
386  except:
387  logging.error('GeoIP servers not available')
388  return {}
389  try:
390  if txt.find('<Response>') > 0:
391  countrys = re.findall(r'<CountryName>([\w ]+)<',txt)[0]
392  citys = re.findall(r'<City>([\w ]+)<',txt)[0]
393  lats,lons = re.findall(r'<Latitude>([\d\-\.]+)</Latitude>\s*<Longitude>([\d\-\.]+)<',txt)[0]
394  elif txt.find('GLatLng') > 0:
395  citys,countrys = re.findall('<br />\s*([^<]+)<br />\s*([^<]+)<',txt)[0]
396  lats,lons = re.findall('LatLng\(([-\d\.]+),([-\d\.]+)',txt)[0]
397  elif txt.find('<gml:coordinates>') > 0:
398  citys = re.findall('<Hostip>\s*<gml:name>(\w+)</gml:name>',txt)[0]
399  countrys = re.findall('<countryName>([\w ,\.]+)</countryName>',txt)[0]
400  lats,lons = re.findall('gml:coordinates>([-\d\.]+),([-\d\.]+)<',txt)[0]
401  else:
402  logging.error('error parsing IP result %s'%txt)
403  return {}
404  return {'Country':countrys,'City':citys,'Latitude':lats,'Longitude':lons}
405  except:
406  logging.error('Error parsing IP result %s'%txt)
407  return {}
408  # end detectUserLocationByIP()
409 
410 
411  def displayCustomHTML(self):
412  """Common name for a custom HTML display. Used to interface with MythTV plugin NetVision
413  """
414  embedFlashVarFilter = etree.XPath('//embed', namespaces=self.namespaces)
415  variables = self.HTMLvideocode.split(u'?')
416 
417  url = u'%s/nv_python_libs/configs/HTML/%s' % (baseProcessingDir, variables[0])
418  try:
419  customHTML = etree.parse(url)
420  except Exception, e:
421  raise Exception(u"! Error: The Custom HTML file (%s) cause the exception error (%s)\n" % (url, errormsg))
422 
423  # There may be one or more argumants to replace in the HTML code
424  # Example:
425  # "bbciplayer.html?AttribName1/FirstReplace=bc623bc?SecondReplace/AttribName2=wonderland/..."
426  for arg in variables[1:]:
427  (attrib, key_value) = arg.split(u'/')
428  (key, value) = key_value.split(u'=')
429  embedFlashVarFilter(customHTML)[0].attrib[attrib] = embedFlashVarFilter(customHTML)[0].attrib[attrib].replace(key, value)
430 
431  sys.stdout.write(etree.tostring(customHTML, encoding='UTF-8', pretty_print=True))
432 
433  sys.exit(0)
434  # end displayCustomHTML()
435 
436 
437  def mnvChannelElement(self, channelDetails):
438  ''' Create a MNV Channel element populated with channel details
439  return the channel element
440  '''
441  mnvChannel = etree.fromstring(u"""
442 <channel>
443  <title>%(channel_title)s</title>
444  <link>%(channel_link)s</link>
445  <description>%(channel_description)s</description>
446  <numresults>%(channel_numresults)d</numresults>
447  <returned>%(channel_returned)d</returned>
448  <startindex>%(channel_startindex)d</startindex>
449 </channel>
450 """ % channelDetails
451  )
452  return mnvChannel
453  # end mnvChannelElement()
454 
    # Verify that a URL actually exists
456  def checkURL(self, url):
457  '''Verify that a URL actually exists. Be careful as redirects can lead to false positives. Use
458  the info details to be sure.
459  return True when it exists and info
460  return False when it does not exist and info
461  '''
462  urlOpened = urllib.urlopen(url)
463  code = urlOpened.getcode()
464  actualURL = urlOpened.geturl()
465  info = urlOpened.info()
466  urlOpened.close()
467  if code != 200:
468  return [False, info]
469  if url != actualURL:
470  return [False, info]
471  return [True, info]
472  # end checkURL()
473 
474 
    def getUrlData(self, inputUrls, pageFilter=None):
        ''' Fetch url data and extract the desired results using a dynamic filter or XSLT stylesheet.
        The URLs are requested in parallel using threading
        @param inputUrls an element tree containing <url> elements, each with
               a <name>, <href>, <parserType> and optional <filter>, <xslt>
               and <parameter> children
        @param pageFilter an optional filter used to flag "more pages"
        return the extracted data organised into directories
        '''
        urlDictionary = {}

        if self.debug:
            print "inputUrls:"
            sys.stdout.write(etree.tostring(inputUrls, encoding='UTF-8', pretty_print=True))
            print

        # Build one work-order dictionary entry per <url> element
        for element in inputUrls.xpath('.//url'):
            key = element.find('name').text
            urlDictionary[key] = {}
            urlDictionary[key]['type'] = 'raw'
            urlDictionary[key]['href'] = element.find('href').text
            urlFilter = element.findall('filter')
            if len(urlFilter):
                # XPath filters: replace the elements with their text patterns
                urlDictionary[key]['type'] = 'xpath'
                for index in range(len(urlFilter)):
                    urlFilter[index] = urlFilter[index].text
                urlDictionary[key]['filter'] = urlFilter
            urlXSLT = element.findall('xslt')
            if len(urlXSLT):
                # XSLT: compile each named stylesheet from the configs directory
                urlDictionary[key]['type'] = 'xslt'
                for index in range(len(urlXSLT)):
                    urlXSLT[index] = etree.XSLT(etree.parse(u'%s/nv_python_libs/configs/XSLT/%s.xsl' % (self.baseProcessingDir, urlXSLT[index].text)))
                urlDictionary[key]['xslt'] = urlXSLT
            urlDictionary[key]['pageFilter'] = pageFilter
            urlDictionary[key]['parser'] = self.parsers[element.find('parserType').text].copy()
            urlDictionary[key]['namespaces'] = self.namespaces
            urlDictionary[key]['result'] = []
            urlDictionary[key]['morePages'] = u'false'
            urlDictionary[key]['tmp'] = None
            urlDictionary[key]['tree'] = None
            if element.find('parameter') != None:
                urlDictionary[key]['parameter'] = element.find('parameter').text

        if self.debug:
            print "urlDictionary:"
            print urlDictionary
            print

        thread_list = []
        # The getURL worker threads share state through this class attribute
        getURL.urlDictionary = urlDictionary

        # Single threaded (commented out) - Only used to prove that multi-threading does
        # not cause data corruption
#        for key in urlDictionary.keys():
#            current = getURL(key, self.debug)
#            thread_list.append(current)
#            current.start()
#            current.join()

        # Multi-threaded
        for key in urlDictionary.keys():
            current = getURL(key, self.debug)
            thread_list.append(current)
            current.start()
        for thread in thread_list:
            thread.join()

        # Take the results and make the return element tree
        # NOTE(review): the branches below assume every entry has 'filter'
        # and 'xslt' keys — raw entries appear to rely on getURL to supply
        # defaults; verify against the getURL implementation
        root = etree.XML(u"<xml></xml>")
        for key in sorted(getURL.urlDictionary.keys()):
            if not len(getURL.urlDictionary[key]['result']):
                continue
            results = etree.SubElement(root, "results")
            etree.SubElement(results, "name").text = key
            etree.SubElement(results, "url").text = urlDictionary[key]['href']
            etree.SubElement(results, "type").text = urlDictionary[key]['type']
            etree.SubElement(results, "pageInfo").text = getURL.urlDictionary[key]['morePages']
            result = etree.SubElement(results, "result")
            if len(getURL.urlDictionary[key]['filter']):
                # XPath filtered: each page yields a list of matched elements
                for index in range(len(getURL.urlDictionary[key]['result'])):
                    for element in getURL.urlDictionary[key]['result'][index]:
                        result.append(element)
            elif len(getURL.urlDictionary[key]['xslt']):
                # XSLT transformed: copy the children of each result tree root
                for index in range(len(getURL.urlDictionary[key]['result'])):
                    for element in getURL.urlDictionary[key]['result'][index].getroot():
                        result.append(element)
            else:
                # Raw: copy the parsed document's root elements
                for element in getURL.urlDictionary[key]['result'][0].xpath('/*'):
                    result.append(element)

        if self.debug:
            print "root:"
            sys.stdout.write(etree.tostring(root, encoding='UTF-8', pretty_print=True))
            print

        return root
        # end getShows()
568 
569 
572  def buildFunctionDict(self):
573  ''' Create a dictionary of functions that manipulate items data. These functions are imported
574  from other MNV grabbers. These functions are meant to be used by the MNV WebCgi type of grabber
575  which aggregates data from a number of different sources (e.g. RSS feeds and HTML Web pages)
576  including sources from other grabbers.
577  Using a dictionary facilitates mixing XSLT functions with pure python functions to use the best
578  capabilities of both technologies when translating source information into MNV compliant item
579  data.
580  return nothing
581  '''
582  # Add the common XPath extention functions
583  self.functionDict = {
584  'pubDate': self.pubDate,
585  'getSeasonEpisode': self.getSeasonEpisode,
586  'convertDuration': self.convertDuration,
587  'getHtmlData': self.getHtmlData,
588  'linkWebPage': self.linkWebPage,
589  'baseDir': self.baseDir,
590  'stringLower': self.stringLower,
591  'stringUpper': self.stringUpper,
592  'stringReplace': self.stringReplace,
593  'stringEscape': self.stringEscape,
594  'removePunc': self.removePunc,
595  'htmlToString': self.htmlToString,
596  'checkIfDBItem': self.checkIfDBItem,
597  'getItemElement': self.getItemElement,
598  'getDBRecords': self.getDBRecords,
599  'createItemElement': self.createItemElement,
600  'testSubString': self.testSubString,
601  }
602  # Get the specific source functions
603  self.addDynamicFunctions('xsltfunctions')
604  return
605  # end buildFunctionDict()
606 
    def addDynamicFunctions(self, dirPath):
        ''' Dynamically add functions to the function dictionary from a specified directory
        Each plugin module in the directory declares __xpathClassList__ and/or
        __xsltExtentionList__ naming the callables to register.
        @param dirPath directory name under nv_python_libs to scan
        return nothing
        '''
        fullPath = u'%s/nv_python_libs/%s' % (self.baseProcessingDir, dirPath)
        sys.path.append(fullPath)
        # Make a list of all functions that need to be included
        fileList = []
        for fPath in os.listdir(fullPath):
            filepath, filename = os.path.split( fPath )
            filename, ext = os.path.splitext( filename )
            if filename == u'__init__':
                continue
            if ext != '.py':
                continue
            fileList.append(filename)

        # Do not stop when there is an abort on a library just send an error message to stderr
        for fileName in fileList:
            filename = {'filename': fileName, }
            try:
                # NOTE(review): exec-based plugin loading — only trusted
                # modules from the grabber install tree may live in dirPath
                exec('''
import %(filename)s
%(filename)s.common = self
for xpathClass in %(filename)s.__xpathClassList__:
    exec(u'xpathClass = %(filename)s.%%s()' %% xpathClass)
    for func in xpathClass.functList:
        exec("self.functionDict['%%s'] = %%s" %% (func, u'xpathClass.%%s' %% func))
for xsltExtension in %(filename)s.__xsltExtentionList__:
    exec("self.functionDict['%%s'] = %%s" %% (xsltExtension, u'%(filename)s.%%s' %% xsltExtension))''' % filename )
            except Exception, errmsg:
                sys.stderr.write(u'! Error: Dynamic import of (%s) XPath and XSLT extention functions\nmessage(%s)\n' % (fileName, errmsg))

        return
        # end addDynamicFunctions()
642 
    def pubDate(self, context, *inputArgs):
        '''Convert a date/time string in a specified format into a pubDate. The default is the
        MNV item format
        args[0]: the source date string; args[1] (optional): its strptime
        format; args[2] (optional): the strftime output format
        return the formatted pubDate string
        return on error return the original date string
        '''
        args = []
        for arg in inputArgs:
            args.append(arg)
        if args[0] == u'':
            # An empty date string means "now"
            return datetime.datetime.now().strftime(self.pubDateFormat)
        # Strip a trailing numeric timezone offset (e.g. "+0000"/"-0500");
        # the index > 5 guard avoids clipping a leading day/month separator
        index = args[0].find('+')
        if index == -1:
            index = args[0].find('-')
        if index != -1 and index > 5:
            args[0] = args[0][:index].strip()
        args[0] = args[0].replace(',', u'').replace('.', u'')
        try:
            if len(args) > 1:
                args[1] = args[1].replace(',', u'').replace('.', u'')
                if args[1].find('GMT') != -1:
                    # Drop the "GMT" marker from the format string and the
                    # final (timezone) token from the date string
                    args[1] = args[1][:args[1].find('GMT')].strip()
                    args[0] = args[0][:args[0].rfind(' ')].strip()
                try:
                    pubdate = time.strptime(args[0], args[1])
                except ValueError:
                    # Retry with the abbreviated/full month directive swapped
                    if args[1] == '%a %d %b %Y %H:%M:%S':
                        pubdate = time.strptime(args[0], '%a %d %B %Y %H:%M:%S')
                    elif args[1] == '%a %d %B %Y %H:%M:%S':
                        pubdate = time.strptime(args[0], '%a %d %b %Y %H:%M:%S')
                if len(args) > 2:
                    return time.strftime(args[2], pubdate)
                else:
                    return time.strftime(self.pubDateFormat, pubdate)
            else:
                # No input format supplied: return "now" in MNV format
                return datetime.datetime.now().strftime(self.pubDateFormat)
        except Exception, err:
            sys.stderr.write(u'! Error: pubDate variables(%s) error(%s)\n' % (args, err))
            return args[0]
        # end pubDate()
683 
684  def getSeasonEpisode(self, context, text):
685  ''' Check is there is any season or episode number information in an item's text
686  return a string of season and/or episode numbers e.g. "2_21"
687  return a string with "None_None" values
688  '''
689  s_e = [None, None]
690  for regexPattern in self.s_e_Patterns:
691  match = regexPattern.match(text)
692  if not match:
693  continue
694  season_episode = match.groups()
695  if len(season_episode) > 1:
696  s_e[0] = season_episode[0]
697  s_e[1] = season_episode[1]
698  else:
699  s_e[1] = season_episode[0]
700  return u'%s_%s' % (s_e[0], s_e[1])
701  return u'%s_%s' % (s_e[0], s_e[1])
702  # end getSeasonEpisode()
703 
704  def convertDuration(self, context, duration):
705  ''' Take a duration and convert it to seconds
706  return a string of seconds
707  '''
708  min_sec = duration.split(':')
709  seconds = 0
710  for count in range(len(min_sec)):
711  if count != len(min_sec)-1:
712  seconds+=int(min_sec[count])*(60*(len(min_sec)-count-1))
713  else:
714  seconds+=int(min_sec[count])
715  return u'%s' % seconds
716  # end convertDuration()
717 
718  def getHtmlData(self, context, *args):
719  ''' Take a HTML string and convert it to an HTML element. Then apply a filter and return
720  that value.
721  return filter value as a string
722  return an empty sting if the filter failed to find any values.
723  '''
724  xpathFilter = None
725  if len(args) > 1:
726  xpathFilter = args[0]
727  htmldata = args[1]
728  else:
729  htmldata = args[0]
730  htmlElement = etree.HTML(htmldata)
731  if not xpathFilter:
732  return htmlElement
733  filteredData = htmlElement.xpath(xpathFilter)
734  if len(filteredData):
735  if xpathFilter.find('@') != -1:
736  return filteredData[0]
737  else:
738  return filteredData[0].text
739  return u''
740  # end getHtmlData()
741 
742  def linkWebPage(self, context, sourceLink):
743  ''' Check if there is a special local HTML page for the link. If not then return a generic
744  download only local HTML url.
745  return a file://.... link to a local HTML web page
746  '''
747  # Currently there are no link specific Web pages
748  if not self.linksWebPage:
749  self.linksWebPage = etree.parse(u'%s/nv_python_libs/configs/XML/customeHtmlPageList.xml' % (self.baseProcessingDir, ))
750  if self.linksWebPage.find(sourceLink) != None:
751  return u'file://%s/nv_python_libs/configs/HTML/%s' % (self.baseProcessingDir, self.linksWebPage.find(sourceLink).text)
752  return u'file://%s/nv_python_libs/configs/HTML/%s' % (self.baseProcessingDir, 'nodownloads.html')
753  # end linkWebPage()
754 
    def baseDir(self, context, dummy):
        ''' Return the base directory string
        @param context XPath/XSLT extension context (unused)
        @param dummy placeholder required by the extension call signature (unused)
        return the base directory
        '''
        return self.baseProcessingDir
        # end baseDir()
761 
762  def stringLower(self, context, data):
763  '''
764  return a lower case string
765  '''
766  if not len(data):
767  return u''
768  return data[0].lower()
769  # end stringLower()
770 
771  def stringUpper(self, context, data):
772  '''
773  return a upper case string
774  '''
775  if not len(data):
776  return u''
777  return data[0].upper()
778  # end stringUpper()
779 
780  def stringReplace(self, context, *inputArgs):
781  ''' Replace substring values in a string
782  return the resulting string from a replace operation
783  '''
784  args = []
785  for arg in inputArgs:
786  args.append(arg)
787  if not len(args) or len(args) == 1:
788  return data
789  if len(args) == 2:
790  args[0] = args[0].replace(args[1], "")
791  else:
792  args[0] = args[0].replace(args[1], args[2])
793  return args[0].strip()
794  # end stringReplace()
795 
796  def stringEscape(self, context, *args):
797  ''' Replace substring values in a string
798  return the resulting string from a replace operation
799  '''
800  if not len(args):
801  return u""
802  if len(args) == 1:
803  return urllib.quote_plus(args[0].encode("utf-8"))
804  else :
805  return urllib.quote_plus(args[0].encode("utf-8"), args[1])
806  # end stringEscape()
807 
808  def removePunc(self, context, data):
809  ''' Remove all punctuation for a string
810  return the resulting string
811  '''
812  if not len(data):
813  return u""
814  return re.sub('[%s]' % re.escape(string.punctuation), '', data)
815  # end removePunc()
816 
817  def htmlToString(self, context, html):
818  ''' Remove HTML tags and LFs from a string
819  return the string without HTML tags or LFs
820  '''
821  if not len(html):
822  return u""
823  return self.massageText(html).strip().replace(u'\n', u' ').replace(u'’', u"&apos;").replace(u'“', u"&apos;")
824  # end htmlToString()
825 
    def getLanguage(self, context, args):
        ''' Return the current selected language code (default is u'en')
        @param context XPath/XSLT extension context (unused)
        @param args placeholder required by the extension call signature (unused)
        return language code
        '''
        return self.language
        # end getLanguage()
832 
833  def checkIfDBItem(self, context, arg):
834  ''' Find an 'internetcontentarticles' table record based on fields and values
835  return True if a record was found and an item element created
836  return False if no record was found
837  '''
838  results = self.getDBRecords('dummy', arg)
839  if len(results):
840  self.itemElement = self.createItemElement('dummy', results[0])
841  return True
842  return False
843  # end checkIfDBItem()
844 
    def getItemElement(self, context, arg):
        ''' Return an item element that was created by a previous call to the checkIfDBItem function
        # NOTE(review): self.itemElement is only set by a successful
        # checkIfDBItem call; calling this first raises AttributeError
        '''
        return self.itemElement
        # end getItemElement()
850 
851  def testSubString(self, context, *arg):
852  ''' Return True or False if a substring is at the beginning or end of a string
853  '''
854  if arg[0] == 'starts':
855  return arg[1].startswith(arg[2])
856  elif arg[0] == 'ends':
857  return arg[1].endswith(arg[2])
858  else:
859  index = arg[1].find(arg[2])
860  if index == -1:
861  return False
862  else:
863  return True
864  # end testSubString()
865 
    def getDBRecords(self, context, *arg):
        ''' Return a list of 'internetcontentarticles' table records based on field and value matches

        arg[0] is a dictionary of search field names to (unicode) values; it
        is forwarded as keyword arguments to MythDB.searchInternetContent().
        '''
        # Lazily open the MythTV DB connection on first use and compile the
        # two media XPath helpers that createItemElement() relies on.
        if not self.mythdb:
            self.initializeMythDB()
            self.itemThumbnail = etree.XPath('.//media:thumbnail', namespaces=self.namespaces)
            self.itemContent = etree.XPath('.//media:content', namespaces=self.namespaces)
        # Encode the search text to UTF-8
        # NOTE(review): this mutates the caller's dictionary in place, and the
        # bare except turns ANY value that cannot be encoded (e.g. a
        # non-string) into an empty "no match" result -- confirm that this
        # best-effort behaviour is intended before narrowing the except.
        for key in arg[0].keys():
            try:
                arg[0][key] = arg[0][key].encode('UTF-8')
            except:
                return []
        return list(self.mythdb.searchInternetContent(**arg[0]))
    # end getDBRecords()
881 
    def createItemElement(self, context, *arg):
        ''' Create an item element from an 'internetcontentarticles' table record dictionary
        return the item element

        arg[0] is a DB record dictionary; self.mnvItem is the MNV item XML
        template parsed as the starting element, and self.itemThumbnail /
        self.itemContent are the XPath helpers compiled by getDBRecords().
        '''
        result = arg[0]
        itemElement = etree.XML(self.mnvItem)
        # Insert data into a new item element
        itemElement.find('link').text = result['url']
        if result['title']:
            itemElement.find('title').text = result['title']
        if result['subtitle']:
            etree.SubElement(itemElement, "subtitle").text = result['subtitle']
        if result['description']:
            itemElement.find('description').text = result['description']
        if result['author']:
            itemElement.find('author').text = result['author']
        if result['date']:
            itemElement.find('pubDate').text = result['date'].strftime(self.pubDateFormat)
        # '32576' and leading '-' values appear to be "no rating" sentinels.
        # NOTE(review): result['rating'][0] raises IndexError when the rating
        # is an empty string -- confirm the DB never returns u'' here.
        if result['rating'] != '32576' and result['rating'][0] != '-':
            itemElement.find('rating').text = result['rating']
        # Thumbnail/content URLs and media attributes go on the pre-existing
        # media:* child elements located via the compiled XPath helpers.
        if result['thumbnail']:
            self.itemThumbnail(itemElement)[0].attrib['url'] = result['thumbnail']
        if result['mediaURL']:
            self.itemContent(itemElement)[0].attrib['url'] = result['mediaURL']
        if result['filesize'] > 0:
            self.itemContent(itemElement)[0].attrib['length'] = unicode(result['filesize'])
        if result['time'] > 0:
            self.itemContent(itemElement)[0].attrib['duration'] = unicode(result['time'])
        if result['width'] > 0:
            self.itemContent(itemElement)[0].attrib['width'] = unicode(result['width'])
        if result['height'] > 0:
            self.itemContent(itemElement)[0].attrib['height'] = unicode(result['height'])
        if result['language']:
            self.itemContent(itemElement)[0].attrib['lang'] = result['language']
        # Optional MNV-namespaced children are appended only when present
        if result['season'] > 0:
            etree.SubElement(itemElement, "{http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format}season").text = unicode(result['season'])
        if result['episode'] > 0:
            etree.SubElement(itemElement, "{http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format}episode").text = unicode(result['episode'])
        if result['customhtml'] == 1:
            etree.SubElement(itemElement, "{http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format}customhtml").text = 'true'
        if result['countries']:
            # One country element per space-separated country code
            countries = result['countries'].split(u' ')
            for country in countries:
                etree.SubElement(itemElement, "{http://www.mythtv.org/wiki/MythNetvision_Grabber_Script_Format}country").text = country
        return itemElement
    # end createItemElement()
928 
    def initializeMythDB(self):
        ''' Import the MythTV database bindings and open a DB connection,
        stored on self.mythdb. Any failure (missing bindings, bad or absent
        ~/.mythtv/config.xml) is reported on stderr and terminates the
        process with exit status 1.
        return nothing
        '''
        try:
            # Imported here (not at module level) so the grabber can load
            # even when the MythTV bindings are absent until DB access is needed
            from MythTV import MythDB, MythLog, MythError
            try:
                '''Create an instance of each: MythDB
                '''
                MythLog._setlevel('none') # Some non option -M cannot have any logging on stdout
                self.mythdb = MythDB()
            except MythError, e:
                # Connection failed: point the user at the config file
                sys.stderr.write(u'\n! Error - %s\n' % e.args[0])
                filename = os.path.expanduser("~")+'/.mythtv/config.xml'
                if not os.path.isfile(filename):
                    sys.stderr.write(u'\n! Error - A correctly configured (%s) file must exist\n' % filename)
                else:
                    sys.stderr.write(u'\n! Error - Check that (%s) is correctly configured\n' % filename)
                sys.exit(1)
            except Exception, e:
                sys.stderr.write(u"\n! Error - Creating an instance caused an error for one of: MythDB. error(%s)\n" % e)
                sys.exit(1)
        except Exception, e:
            # The import itself failed: bindings are not installed/importable
            sys.stderr.write(u"\n! Error - MythTV python bindings could not be imported. error(%s)\n" % e)
            sys.exit(1)
    # end initializeMythDB()
955 
956 
957 
961 
962 class getURL(Thread):
963  ''' Threaded download of a URL and filter out the desired data for XML and (X)HTML
964  return the filter results
965  '''
966  def __init__ (self, urlKey, debug):
967  Thread.__init__(self)
968  self.urlKey = urlKey
969  self.debug = debug
970 
971  def run(self):
972  if self.debug:
973  print u"getURL href(%s)" % (self.urlDictionary[self.urlKey]['href'], )
974  print
975 
976  # Input the data from a url
977  try:
978  self.urlDictionary[self.urlKey]['tree'] = etree.parse(self.urlDictionary[self.urlKey]['href'], self.urlDictionary[self.urlKey]['parser'])
979  except Exception, errormsg:
980  sys.stderr.write(u"! Error: The URL (%s) cause the exception error (%s)\n" % (self.urlDictionary[self.urlKey]['href'], errormsg))
981  return
982 
983  if self.debug:
984  print "Raw unfiltered URL input:"
985  sys.stdout.write(etree.tostring(self.urlDictionary[self.urlKey]['tree'], encoding='UTF-8', pretty_print=True))
986  print
987 
988  if len(self.urlDictionary[self.urlKey]['filter']):
989  for index in range(len(self.urlDictionary[self.urlKey]['filter'])):
990  # Filter out the desired data
991  try:
992  self.urlDictionary[self.urlKey]['tmp'] = self.urlDictionary[self.urlKey]['tree'].xpath(self.urlDictionary[self.urlKey]['filter'][index], namespaces=self.urlDictionary[self.urlKey]['namespaces'])
993  except AssertionError, e:
994  sys.stderr.write(u"No filter results for Name(%s)\n" % self.urlKey)
995  sys.stderr.write(u"No filter results for url(%s)\n" % self.urlDictionary[self.urlKey]['href'])
996  sys.stderr.write(u"! Error:(%s)\n" % e)
997  if len(self.urlDictionary[self.urlKey]['filter']) == index-1:
998  return
999  else:
1000  continue
1001  self.urlDictionary[self.urlKey]['result'].append(self.urlDictionary[self.urlKey]['tmp'])
1002  elif len(self.urlDictionary[self.urlKey]['xslt']):
1003  for index in range(len(self.urlDictionary[self.urlKey]['xslt'])):
1004  # Process the results through a XSLT stylesheet out the desired data
1005  try:
1006  if self.urlDictionary[self.urlKey].has_key('parameter'):
1007  self.urlDictionary[self.urlKey]['tmp'] = self.urlDictionary[self.urlKey]['xslt'][index](self.urlDictionary[self.urlKey]['tree'], paraMeter= etree.XSLT.strparam(
1008 self.urlDictionary[self.urlKey]['parameter']) )
1009  else:
1010  self.urlDictionary[self.urlKey]['tmp'] = self.urlDictionary[self.urlKey]['xslt'][index](self.urlDictionary[self.urlKey]['tree'])
1011  except Exception, e:
1012  sys.stderr.write(u"! XSLT Error:(%s) Key(%s)\n" % (e, self.urlKey))
1013  if len(self.urlDictionary[self.urlKey]['filter']) == index-1:
1014  return
1015  else:
1016  continue
1017  # Was any data found?
1018  if self.urlDictionary[self.urlKey]['tmp'].getroot() == None:
1019  sys.stderr.write(u"No Xslt results for Name(%s)\n" % self.urlKey)
1020  sys.stderr.write(u"No Xslt results for url(%s)\n" % self.urlDictionary[self.urlKey]['href'])
1021  if len(self.urlDictionary[self.urlKey]['filter']) == index-1:
1022  return
1023  else:
1024  continue
1025  self.urlDictionary[self.urlKey]['result'].append(self.urlDictionary[self.urlKey]['tmp'])
1026  else:
1027  # Just pass back the raw data
1028  self.urlDictionary[self.urlKey]['result'] = [self.urlDictionary[self.urlKey]['tree']]
1029 
1030  # Check whether there are more pages available
1031  if self.urlDictionary[self.urlKey]['pageFilter']:
1032  if len(self.urlDictionary[self.urlKey]['tree'].xpath(self.urlDictionary[self.urlKey]['pageFilter'], namespaces=self.urlDictionary[self.urlKey]['namespaces'])):
1033  self.urlDictionary[self.urlKey]['morePages'] = 'true'
1034  return
1035  # end run()
1036 # end class getURL()
1037 
1038 
def baseDir(self, context, dummy)
Definition: common_api.py:755
Definition: mythdb.h:14
def stringUpper(self, context, data)
Definition: common_api.py:771
def __init__(self, urlKey, debug)
Definition: common_api.py:966
static void error(const char *str,...)
Definition: vbi.c:42
static pid_list_t::iterator find(const PIDInfoMap &map, pid_list_t &list, pid_list_t::iterator begin, pid_list_t::iterator end, bool find_open)
def getHtmlData(self, context, *args)
Definition: common_api.py:718
def __init__(self, logger=False, debug=False)
Definition: common_api.py:135
def getSeasonEpisode(self, context, text)
Definition: common_api.py:684
def linkWebPage(self, context, sourceLink)
Definition: common_api.py:742
def pubDate(self, context, *inputArgs)
Definition: common_api.py:643
long long copy(QFile &dst, QFile &src, uint block_size)
Copies src file to dst file.
def stringEscape(self, context, *args)
Definition: common_api.py:796
def getUrlData(self, inputUrls, pageFilter=None)
Definition: common_api.py:475
def stringLower(self, context, data)
Definition: common_api.py:762
def callCommandLine(self, command, stderr=False)
Definition: common_api.py:301
def stringReplace(self, context, *inputArgs)
Definition: common_api.py:780
def buildFunctionDict(self)
Start - Utility functions specifically used to modify MNV item data.
Definition: common_api.py:572
Start - Utility functions.
Definition: common_api.py:132
def initLogger(self, path=sys.stderr, log_name=u 'MNV_Grabber')
Definition: common_api.py:255
def removePunc(self, context, data)
Definition: common_api.py:808
def convertDuration(self, context, duration)
Definition: common_api.py:704
def checkIfDBItem(self, context, arg)
Definition: common_api.py:833
def getLanguage(self, context, args)
Definition: common_api.py:826
def __init__(self, outstream, encoding=None)
Definition: common_api.py:65
def createItemElement(self, context, *arg)
Definition: common_api.py:882
def getItemElement(self, context, arg)
Definition: common_api.py:845
def mnvChannelElement(self, channelDetails)
Definition: common_api.py:437
def getDBRecords(self, context, *arg)
Definition: common_api.py:866
def testSubString(self, context, *arg)
Definition: common_api.py:851
def htmlToString(self, context, html)
Definition: common_api.py:817