summaryrefslogtreecommitdiff
path: root/community-testing/freevo/imdb-html5lib.patch
diff options
context:
space:
mode:
Diffstat (limited to 'community-testing/freevo/imdb-html5lib.patch')
-rw-r--r--community-testing/freevo/imdb-html5lib.patch301
1 files changed, 301 insertions, 0 deletions
diff --git a/community-testing/freevo/imdb-html5lib.patch b/community-testing/freevo/imdb-html5lib.patch
new file mode 100644
index 000000000..6f454246e
--- /dev/null
+++ b/community-testing/freevo/imdb-html5lib.patch
@@ -0,0 +1,301 @@
+Index: src/helpers/imdb.py
+===================================================================
+--- src/helpers/imdb.py (revision 11608)
++++ src/helpers/imdb.py (working copy)
+@@ -53,19 +53,19 @@
+ parser = OptionParser(version='%prog 1.0', conflict_handler='resolve', usage="""
+ Search IMDB for a movie or a TV show
+
+-freevo imdb [options] <search> [<output> <video file> [<video file>]]
++freevo imdb [options] | [<result> <fxd file> <video file> [<video file>]]
+
+-Generate <output>.fxd for the movie. Files is a list of files that belongs to
+-this movie. Use [dvd|vcd] to add the whole disc or use [dvd|vcd][title] to add
+-a special DVD or VCD title to the list of files""")
++Generate a fxd for the movie. Files is a list of files that belongs to this
++movie. Use [dvd|vcd] to add the whole disc or use [dvd|vcd][title] to add a
++special DVD or VCD title to the list of files""")
+ parser.add_option('-v', '--verbose', action='count', default=0,
+ help='set the level of verbosity [default:%default]')
+ parser.add_option('-s', '--search', action='store_true', dest='search', default=False,
+ help='search imdb for string [default:%default]')
+ parser.add_option('-g', '--guess', action='store_true', dest='guess', default=False,
+ help='search imdb for possible filename match [default:%default]')
+- parser.add_option('--tv', action='store_true', dest='tv', default=False,
+- help='specify the search is a tv programme [default:%default]')
++ parser.add_option('--tv', action='store', dest='tv', default=None,
++ help='specify the id of a tv programme for a eipsode search [default:%default]')
+ parser.add_option('--season', dest='season', default=None,
+ help='specify the season in the search [default:%default]')
+ parser.add_option('--episode', dest='episode', default=None,
+@@ -116,7 +116,9 @@
+ sys.exit(u'--search requires <search pattern>')
+ elif opts.guess and len(args) < 1:
+ sys.exit(u'--guess requires <guess pattern>')
+- tv_marker = (opts.tv or opts.season or opts.episode) and '"' or ''
++ #elif opts.tv and len(args) < 1:
++ # sys.exit(u'--tv requires <imdb id>')
++ tv_marker = (opts.season or opts.episode) and '"' or ''
+
+ if opts.rom_drive is not None:
+ driveset = True
+@@ -176,6 +178,23 @@
+ print '%s' % title.encode(opts.encoding)
+ sys.exit(0)
+
++ if opts.tv:
++ print "Searching IMDB for '%s' season:%s episode:%s..." % (opts.tv, opts.season, opts.episode)
++ results = fxd.getIMDBid(opts.tv, opts.season, opts.episode)
++ if len(results) == 0:
++ print 'No results'
++ #for result in results:
++ # if result[3]:
++ # title = 'http://www.imdb.com/title/tt%s/ %s %s (%s) %s' % (result[:1] + result[:4])
++ # elif result[2]:
++ # title = 'http://www.imdb.com/title/tt%s/ %s %s (%s)' % (result[:1] + result[:3])
++ # else:
++ # title = 'http://www.imdb.com/title/tt%s/ %s %s' % (result[:1] + result[:2])
++ # title = results
++ title = 'http://www.imdb.com/title/tt%s/ %s' % (results, results)
++ print '%s' % title.encode(opts.encoding)
++ sys.exit(0)
++
+ # normal usage
+ if len(args) < 3:
+ sys.exit(u'requires <imdb id> <fxd filename> <video file>|<cd id>')
+Index: src/util/fxdimdb.py
+===================================================================
+--- src/util/fxdimdb.py (revision 11608)
++++ src/util/fxdimdb.py (working copy)
+@@ -48,8 +48,15 @@
+ import codecs
+ import os
+ import traceback
+-from BeautifulSoup import BeautifulSoup, NavigableString
+-import HTMLParser
++from pprint import pprint, pformat
++try:
++ from html5lib import HTMLParser, treebuilders
++ from html5lib.treebuilders.soup import NavigableString
++ using_html5lib = True
++except ImportError:
++ import HTMLParser
++ from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString
++ using_html5lib = False
+
+ import config
+ import util
+@@ -232,6 +239,7 @@
+ response.close()
+
+ _debug_('id_list has %s items' % (len(self.id_list)))
++ #print 'id_list=%s' % (pformat(self.id_list))
+ if len(self.id_list) > 20:
+ # too many results, check if there are stupid results in the list
+ words = []
+@@ -311,7 +319,11 @@
+ dvd = 0
+
+ try:
+- soup = BeautifulSoup(results.read(), convertEntities='xml')
++ if using_html5lib:
++ parser = HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup'))
++ soup = parser.parse(results.read())
++ else:
++ soup = BeautifulSoup(results.read(), convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
+ except UnicodeDecodeError:
+ print "Unicode error: check that /usr/lib/python2.x/site.py has the correct default encoding"
+ traceback.print_exc()
+@@ -336,68 +348,61 @@
+ self.info['year'] = y[1:-1]
+ except (AttributeError, TypeError, ValueError):
+ self.info['title'] = self.title
+- self.info['year'] = title.find('a').string.strip()
++ try:
++ self.info['year'] = title.find('a').contents[0].strip()
++ except AttributeError:
++ self.info['year'] = ''
+
+ # Find the <div> with class info, each <h5> under this provides info
++ wanted_keys = ('release_date', 'genre', 'tagline', 'plot', 'plot_keywords',
++ 'also_known_as', 'mpaa', 'runtime', 'country', 'language',
++ 'color', 'aspect_ratio', 'sound_mix', 'certification',
++ )
++
+ for info in main.findAll('div', {'class' : 'info'}):
+ infoh5 = info.find('h5')
+ if not infoh5:
+ continue
+ try:
+- infostr = infoh5.next
+- key = infostr.string.strip(':').lower().replace(' ', '_')
+- nextsibling = nextsibling = infoh5.nextSibling.strip()
+- sections = info.findAll('a', { 'href' : re.compile('/Sections') })
+- lists = info.findAll('a', { 'href' : re.compile('/List') })
+- if len(nextsibling) > 0:
+- self.info[key] = nextsibling
++ infostr = infoh5.find(text=True)
++ key = infostr.strip().strip(':').lower().replace(' ', '_')
++ if key not in wanted_keys:
++ continue
++ content = info.find('div', {'class' : 'info-content'})
++ infocontent = content.find(text=True)
++ if infocontent:
++ infocontent = infocontent.strip()
++ sections = info.findAll('a', { 'href' : re.compile('^/Sections') })
++ lists = info.findAll('a', { 'href' : re.compile('^/List') })
++ keywords = info.findAll('a', { 'href' : re.compile('^/keyword') })
++ #print 'key=%s content=%r keywords=%r sections=%r lists=%r' % (key, infocontent, keywords, sections, lists)
++ if len(infocontent) > 0:
++ self.info[key] = infocontent
+ elif len(sections) > 0:
+ items = []
+ for item in sections:
+- items.append(item.string)
++ items.append(item.contents[0].strip())
+ self.info[key] = ' / '.join(items)
+ elif len(lists) > 0:
+ items = []
+ for item in lists:
+- items.append(item.string)
++ items.append(item.contents[0].strip())
+ self.info[key] = ' / '.join(items)
++ elif len(keywords) > 0:
++ items = []
++ for item in keywords:
++ items.append(item.contents[0].strip())
++ self.info[key] = ' / '.join(items)
+ except:
+ pass
+
+- # Find Plot Outline/Summary:
+- # Normally the tag is named "Plot Outline:" - however sometimes
+- # the tag is "Plot Summary:" or just "Plot:". Search for all strings.
+- imdb_result = soup.find(text='Plot Outline:')
+- if not imdb_result:
+- imdb_result = soup.find(text='Plot Summary:')
+- if not imdb_result:
+- imdb_result = soup.find(text='Plot:')
+- if imdb_result:
+- self.info['plot'] = imdb_result.next.strip()
+- else:
+- self.info['plot'] = u''
+-
+- # Find tagline - sometimes the tagline is missing.
+- # Use an empty string if no tagline could be found.
+- imdb_result = soup.find(text='Tagline:')
+- if imdb_result:
+- self.info['tagline'] = imdb_result.next.strip()
+- else:
+- self.info['tagline'] = u''
+-
+ rating = soup.find(text='User Rating:').findNext(text=re.compile('/10'))
+- if rating:
++ try:
+ votes = rating.findNext('a')
+- self.info['rating'] = rating.strip() + ' (' + votes.string.strip() + ')'
+- else:
++ self.info['rating'] = rating.strip() + ' (' + votes.contents[0].strip() + ')'
++ except AttributeError:
+ self.info['rating'] = ''
+
+- runtime = soup.find(text='Runtime:')
+- if runtime and runtime.next:
+- self.info['runtime'] = runtime.next.strip()
+- else:
+- self.info['runtime'] = ''
+-
+ # Replace special characters in the items
+ for (k,v) in self.info.items():
+ self.info[k] = self.convert_entities(v)
+@@ -794,10 +799,14 @@
+ _debug_('parsesearchdata(results=%r, url=%r, id=%r)' % (results, url, id))
+
+ self.id_list = []
+- m = re.compile('/title/tt([0-9]*)/')
+- y = re.compile('\(([^)]+)\)')
++ m = re.compile('/title/tt(\d+)/')
++ y = re.compile('\((\d+)\) *(.*)')
+ try:
+- soup = BeautifulSoup(results.read(), convertEntities='xml')
++ if using_html5lib:
++ parser = HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup'))
++ soup = parser.parse(results.read())
++ else:
++ soup = BeautifulSoup(results.read(), convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
+ except HTMLParser.HTMLParseError, why:
+ traceback.print_exc()
+ _debug_('Cannot parse %r: %s' % (url, why), DWARNING)
+@@ -806,28 +815,37 @@
+ traceback.print_exc()
+ _debug_('Cannot parse %r: %s' % (url, why), DWARNING)
+ return self.id_list
+- items = soup.findAll('a', href=re.compile('/title/tt'))
++ items = soup.findAll('a', href=re.compile('^/title/tt'))
+ ids = set([])
+ for item in items:
+- idm = m.search(item['href'])
++ idm = item.attrMap['href']
+ if not idm:
+ continue
+- if isinstance(item.next.next, NavigableString):
+- yrm = y.findall(item.next.next)
+-
+- id = idm.group(1)
+- name = item.string
+- # skip empty names
+- if not name:
++ m_match = m.match(idm)
++ if not m_match:
++ # skip invalid titles
+ continue
+- # skip duplicate ids
++ id = m_match.group(1)
+ if id in ids:
++ # skip duplicate ids
+ continue
++ name = item.contents[0]
++ if not isinstance(name, NavigableString):
++ # skip empty names
++ continue
++ if isinstance(item.next.next, NavigableString):
++ yrm = item.next.next.strip()
+ ids.add(id)
+- year = len(yrm) > 0 and yrm[0] or '0000'
+- type = len(yrm) > 1 and yrm[1] or ''
++ y_match = y.match(yrm)
++ if y_match:
++ year = y_match.group(1)
++ type = y_match.group(2)
++ else:
++ year = '0000'
++ type = ''
+ #print 'url', item['href']
+ #print item.parent.findChildren(text=re.compile('[^ ]'))
++ #print 'id=%s name=%s year=%s type=%s' % (id, name, year, type)
+ self.id_list += [ ( id, name, year, type ) ]
+
+ for item in self.id_list:
+@@ -840,7 +858,11 @@
+ Returns a new id for getIMDBid with TV series episode data
+ """
+ try:
+- soup = BeautifulSoup(results.read(), convertEntities='xml')
++ if using_html5lib:
++ parser = HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup'))
++ soup = parser.parse(results.read())
++ else:
++ soup = BeautifulSoup(results.read(), convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
+ except UnicodeDecodeError:
+ print "Unicode error; check that /usr/lib/python2.x/site.py has the correct default encoding"
+ pass
+@@ -968,9 +990,6 @@
+ self.image = vfs.basename(self.image)
+
+ _debug_('Downloaded cover image from %s' % (self.image_url))
+- print "Freevo knows nothing about the copyright of this image, please"
+- print "go to %s to check for more information about private." % self.image_url
+- print "use of this image"
+
+
+ def str2XML(self, line):