diff options
Diffstat (limited to 'community-testing/freevo/imdb-html5lib.patch')
-rw-r--r-- | community-testing/freevo/imdb-html5lib.patch | 301 |
1 files changed, 301 insertions, 0 deletions
diff --git a/community-testing/freevo/imdb-html5lib.patch b/community-testing/freevo/imdb-html5lib.patch new file mode 100644 index 000000000..6f454246e --- /dev/null +++ b/community-testing/freevo/imdb-html5lib.patch @@ -0,0 +1,301 @@ +Index: src/helpers/imdb.py +=================================================================== +--- src/helpers/imdb.py (revision 11608) ++++ src/helpers/imdb.py (working copy) +@@ -53,19 +53,19 @@ + parser = OptionParser(version='%prog 1.0', conflict_handler='resolve', usage=""" + Search IMDB for a movie or a TV show + +-freevo imdb [options] <search> [<output> <video file> [<video file>]] ++freevo imdb [options] | [<result> <fxd file> <video file> [<video file>]] + +-Generate <output>.fxd for the movie. Files is a list of files that belongs to +-this movie. Use [dvd|vcd] to add the whole disc or use [dvd|vcd][title] to add +-a special DVD or VCD title to the list of files""") ++Generate a fxd for the movie. Files is a list of files that belongs to this ++movie. Use [dvd|vcd] to add the whole disc or use [dvd|vcd][title] to add a ++special DVD or VCD title to the list of files""") + parser.add_option('-v', '--verbose', action='count', default=0, + help='set the level of verbosity [default:%default]') + parser.add_option('-s', '--search', action='store_true', dest='search', default=False, + help='search imdb for string [default:%default]') + parser.add_option('-g', '--guess', action='store_true', dest='guess', default=False, + help='search imdb for possible filename match [default:%default]') +- parser.add_option('--tv', action='store_true', dest='tv', default=False, +- help='specify the search is a tv programme [default:%default]') ++ parser.add_option('--tv', action='store', dest='tv', default=None, ++ help='specify the id of a tv programme for a eipsode search [default:%default]') + parser.add_option('--season', dest='season', default=None, + help='specify the season in the search [default:%default]') + parser.add_option('--episode', dest='episode', default=None, +@@ -116,7 +116,9 @@ + sys.exit(u'--search requires <search pattern>') + elif opts.guess and len(args) < 1: + sys.exit(u'--guess requires <guess pattern>') +- tv_marker = (opts.tv or opts.season or opts.episode) and '"' or '' ++ #elif opts.tv and len(args) < 1: ++ # sys.exit(u'--tv requires <imdb id>') ++ tv_marker = (opts.season or opts.episode) and '"' or '' + + if opts.rom_drive is not None: + driveset = True +@@ -176,6 +178,23 @@ + print '%s' % title.encode(opts.encoding) + sys.exit(0) + ++ if opts.tv: ++ print "Searching IMDB for '%s' season:%s episode:%s..." % (opts.tv, opts.season, opts.episode) ++ results = fxd.getIMDBid(opts.tv, opts.season, opts.episode) ++ if len(results) == 0: ++ print 'No results' ++ #for result in results: ++ # if result[3]: ++ # title = 'http://www.imdb.com/title/tt%s/ %s %s (%s) %s' % (result[:1] + result[:4]) ++ # elif result[2]: ++ # title = 'http://www.imdb.com/title/tt%s/ %s %s (%s)' % (result[:1] + result[:3]) ++ # else: ++ # title = 'http://www.imdb.com/title/tt%s/ %s %s' % (result[:1] + result[:2]) ++ # title = results ++ title = 'http://www.imdb.com/title/tt%s/ %s' % (results, results) ++ print '%s' % title.encode(opts.encoding) ++ sys.exit(0) ++ + # normal usage + if len(args) < 3: + sys.exit(u'requires <imdb id> <fxd filename> <video file>|<cd id>') +Index: src/util/fxdimdb.py +=================================================================== +--- src/util/fxdimdb.py (revision 11608) ++++ src/util/fxdimdb.py (working copy) +@@ -48,8 +48,15 @@ + import codecs + import os + import traceback +-from BeautifulSoup import BeautifulSoup, NavigableString +-import HTMLParser ++from pprint import pprint, pformat ++try: ++ from html5lib import HTMLParser, treebuilders ++ from html5lib.treebuilders.soup import NavigableString ++ using_html5lib = True ++except ImportError: ++ import HTMLParser ++ from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString ++ using_html5lib = False + + import config + import util +@@ -232,6 +239,7 @@ + response.close() + + _debug_('id_list has %s items' % (len(self.id_list))) ++ #print 'id_list=%s' % (pformat(self.id_list)) + if len(self.id_list) > 20: + # too many results, check if there are stupid results in the list + words = [] +@@ -311,7 +319,11 @@ + dvd = 0 + + try: +- soup = BeautifulSoup(results.read(), convertEntities='xml') ++ if using_html5lib: ++ parser = HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup')) ++ soup = parser.parse(results.read()) ++ else: ++ soup = BeautifulSoup(results.read(), convertEntities=BeautifulStoneSoup.HTML_ENTITIES) + except UnicodeDecodeError: + print "Unicode error: check that /usr/lib/python2.x/site.py has the correct default encoding" + traceback.print_exc() +@@ -336,68 +348,61 @@ + self.info['year'] = y[1:-1] + except (AttributeError, TypeError, ValueError): + self.info['title'] = self.title +- self.info['year'] = title.find('a').string.strip() ++ try: ++ self.info['year'] = title.find('a').contents[0].strip() ++ except AttributeError: ++ self.info['year'] = '' + + # Find the <div> with class info, each <h5> under this provides info ++ wanted_keys = ('release_date', 'genre', 'tagline', 'plot', 'plot_keywords', ++ 'also_known_as', 'mpaa', 'runtime', 'country', 'language', ++ 'color', 'aspect_ratio', 'sound_mix', 'certification', ++ ) ++ + for info in main.findAll('div', {'class' : 'info'}): + infoh5 = info.find('h5') + if not infoh5: + continue + try: +- infostr = infoh5.next +- key = infostr.string.strip(':').lower().replace(' ', '_') +- nextsibling = nextsibling = infoh5.nextSibling.strip() +- sections = info.findAll('a', { 'href' : re.compile('/Sections') }) +- lists = info.findAll('a', { 'href' : re.compile('/List') }) +- if len(nextsibling) > 0: +- self.info[key] = nextsibling ++ infostr = infoh5.find(text=True) ++ key = infostr.strip().strip(':').lower().replace(' ', '_') ++ if key not in wanted_keys: ++ continue ++ content = info.find('div', {'class' : 'info-content'}) ++ infocontent = content.find(text=True) ++ if infocontent: ++ infocontent = infocontent.strip() ++ sections = info.findAll('a', { 'href' : re.compile('^/Sections') }) ++ lists = info.findAll('a', { 'href' : re.compile('^/List') }) ++ keywords = info.findAll('a', { 'href' : re.compile('^/keyword') }) ++ #print 'key=%s content=%r keywords=%r sections=%r lists=%r' % (key, infocontent, keywords, sections, lists) ++ if len(infocontent) > 0: ++ self.info[key] = infocontent + elif len(sections) > 0: + items = [] + for item in sections: +- items.append(item.string) ++ items.append(item.contents[0].strip()) + self.info[key] = ' / '.join(items) + elif len(lists) > 0: + items = [] + for item in lists: +- items.append(item.string) ++ items.append(item.contents[0].strip()) + self.info[key] = ' / '.join(items) ++ elif len(keywords) > 0: ++ items = [] ++ for item in keywords: ++ items.append(item.contents[0].strip()) ++ self.info[key] = ' / '.join(items) + except: + pass + +- # Find Plot Outline/Summary: +- # Normally the tag is named "Plot Outline:" - however sometimes +- # the tag is "Plot Summary:" or just "Plot:". Search for all strings. +- imdb_result = soup.find(text='Plot Outline:') +- if not imdb_result: +- imdb_result = soup.find(text='Plot Summary:') +- if not imdb_result: +- imdb_result = soup.find(text='Plot:') +- if imdb_result: +- self.info['plot'] = imdb_result.next.strip() +- else: +- self.info['plot'] = u'' +- +- # Find tagline - sometimes the tagline is missing. +- # Use an empty string if no tagline could be found. +- imdb_result = soup.find(text='Tagline:') +- if imdb_result: +- self.info['tagline'] = imdb_result.next.strip() +- else: +- self.info['tagline'] = u'' +- + rating = soup.find(text='User Rating:').findNext(text=re.compile('/10')) +- if rating: ++ try: + votes = rating.findNext('a') +- self.info['rating'] = rating.strip() + ' (' + votes.string.strip() + ')' +- else: ++ self.info['rating'] = rating.strip() + ' (' + votes.contents[0].strip() + ')' ++ except AttributeError: + self.info['rating'] = '' + +- runtime = soup.find(text='Runtime:') +- if runtime and runtime.next: +- self.info['runtime'] = runtime.next.strip() +- else: +- self.info['runtime'] = '' +- + # Replace special characters in the items + for (k,v) in self.info.items(): + self.info[k] = self.convert_entities(v) +@@ -794,10 +799,14 @@ + _debug_('parsesearchdata(results=%r, url=%r, id=%r)' % (results, url, id)) + + self.id_list = [] +- m = re.compile('/title/tt([0-9]*)/') +- y = re.compile('\(([^)]+)\)') ++ m = re.compile('/title/tt(\d+)/') ++ y = re.compile('\((\d+)\) *(.*)') + try: +- soup = BeautifulSoup(results.read(), convertEntities='xml') ++ if using_html5lib: ++ parser = HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup')) ++ soup = parser.parse(results.read()) ++ else: ++ soup = BeautifulSoup(results.read(), convertEntities=BeautifulStoneSoup.HTML_ENTITIES) + except HTMLParser.HTMLParseError, why: + traceback.print_exc() + _debug_('Cannot parse %r: %s' % (url, why), DWARNING) +@@ -806,28 +815,37 @@ + traceback.print_exc() + _debug_('Cannot parse %r: %s' % (url, why), DWARNING) + return self.id_list +- items = soup.findAll('a', href=re.compile('/title/tt')) ++ items = soup.findAll('a', href=re.compile('^/title/tt')) + ids = set([]) + for item in items: +- idm = m.search(item['href']) ++ idm = item.attrMap['href'] + if not idm: + continue +- if isinstance(item.next.next, NavigableString): +- yrm = y.findall(item.next.next) +- +- id = idm.group(1) +- name = item.string +- # skip empty names +- if not name: ++ m_match = m.match(idm) ++ if not m_match: ++ # skip invalid titles + continue +- # skip duplicate ids ++ id = m_match.group(1) + if id in ids: ++ # skip duplicate ids + continue ++ name = item.contents[0] ++ if not isinstance(name, NavigableString): ++ # skip empty names ++ continue ++ if isinstance(item.next.next, NavigableString): ++ yrm = item.next.next.strip() + ids.add(id) +- year = len(yrm) > 0 and yrm[0] or '0000' +- type = len(yrm) > 1 and yrm[1] or '' ++ y_match = y.match(yrm) ++ if y_match: ++ year = y_match.group(1) ++ type = y_match.group(2) ++ else: ++ year = '0000' ++ type = '' + #print 'url', item['href'] + #print item.parent.findChildren(text=re.compile('[^ ]')) ++ #print 'id=%s name=%s year=%s type=%s' % (id, name, year, type) + self.id_list += [ ( id, name, year, type ) ] + + for item in self.id_list: +@@ -840,7 +858,11 @@ + Returns a new id for getIMDBid with TV series episode data + """ + try: +- soup = BeautifulSoup(results.read(), convertEntities='xml') ++ if using_html5lib: ++ parser = HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup')) ++ soup = parser.parse(results.read()) ++ else: ++ soup = BeautifulSoup(results.read(), convertEntities=BeautifulStoneSoup.HTML_ENTITIES) + except UnicodeDecodeError: + print "Unicode error; check that /usr/lib/python2.x/site.py has the correct default encoding" + pass +@@ -968,9 +990,6 @@ + self.image = vfs.basename(self.image) + + _debug_('Downloaded cover image from %s' % (self.image_url)) +- print "Freevo knows nothing about the copyright of this image, please" +- print "go to %s to check for more information about private." % self.image_url +- print "use of this image" + + + def str2XML(self, line): |