diff options
Diffstat (limited to 'extensions/TimedMediaHandler/MwEmbedModules/TimedText/resources/mw.TextSource.js')
-rw-r--r-- | extensions/TimedMediaHandler/MwEmbedModules/TimedText/resources/mw.TextSource.js | 504 |
1 files changed, 504 insertions, 0 deletions
/**
 * Base mw.TextSource object
 *
 * Represents a single timed-text track: it loads the track's source text,
 * parses it ( SRT, MediaWiki-rendered SRT or TTML ) into caption objects and
 * answers "which captions are visible at time t" queries.
 *
 * @param {Object} source Source object to extend
 * @param {Object} textProvider [Optional] The text provider interface ( to load source from api )
 */
( function( mw, $ ) { "use strict";

	mw.TextSource = function( source, textProvider ) {
		// Forward textProvider as well; init() accepts and stores it.
		return this.init( source, textProvider );
	};
	mw.TextSource.prototype = {

		// The load state:
		loaded: false,

		// Container for the captions
		// captions include "start", "end" and "content" fields
		// ( prototype default only; init() gives every instance its own array )
		captions: [],

		// The css style for captions ( some file formats specify display types )
		// ( prototype default only; init() gives every instance its own map )
		styleCss: {},

		// The previous index of the timed text served
		// Avoids searching the entire array on time updates.
		prevIndex: 0,

		/**
		 * @constructor Inherits mediaSource from embedPlayer
		 * @param {Object} source Base source element, all of its enumerable
		 * 	properties are copied onto this object
		 * @param {Object} textProvider [Optional] Pointer to the textProvider
		 * @return {Object} this
		 */
		init: function( source, textProvider ) {
			// Per-instance containers: styleCss is mutated in place by the TTML
			// parser, so sharing the prototype object would leak styles between
			// text sources. Set before the copy so `source` can still override.
			this.captions = [];
			this.styleCss = {};

			// Inherits mediaSource
			for ( var i in source ) {
				this[ i ] = source[ i ];
			}

			// Set default category to subtitle if unset:
			if ( !this.kind ) {
				this.kind = 'subtitle';
			}
			// Set the textProvider if provided
			if ( textProvider ) {
				this.textProvider = textProvider;
			}
			return this;
		},

		/**
		 * Function to load and parse the source text
		 *
		 * Relies on this.getSrc(), which is not defined in this file
		 * ( presumably copied from the source object in init — confirm ).
		 *
		 * @param {Function} callback Function called once text source is loaded
		 * 	( also called when there is no source url or the request fails )
		 */
		load: function( callback ) {
			var _this = this;
			mw.log("TextSource:: load src "+ _this.getSrc() );

			// Setup up a callback ( in case it was not defined )
			if ( !callback ) {
				callback = function(){ return ; };
			}

			// Check if the captions have already been loaded:
			if ( this.loaded ) {
				return callback();
			}

			// Try to load src via XHR source
			if ( !this.getSrc() ) {
				mw.log( "Error: TextSource no source url for text track");
				return callback();
			}

			// Check type for special loaders:
			$( mw ).triggerQueueCallback( 'TimedText_LoadTextSource', _this, function(){
				if ( _this.loaded ) {
					callback();
					return;
				}
				// if no module loaded the text source use the normal ajax proxy:
				new mw.ajaxProxy({
					url: _this.getSrc(),
					success: function( resultXML ) {
						_this.captions = _this.getCaptions( resultXML );
						_this.loaded = true;
						mw.log("mw.TextSource :: loaded from " + _this.getSrc() + " Found: " + _this.captions.length + ' captions' );
						callback();
					},
					error: function() {
						mw.log("Error: TextSource Error with http response");
						// Marked as loaded so a failed request is not retried on
						// every subsequent load() call.
						_this.loaded = true;
						callback();
					}
				});
			});
		},

		/**
		 * Returns the captions that are in range for the requested time
		 *
		 * Captions are expected to be stored in start-time order; the search
		 * resumes from the index served last time unless the time moved backwards.
		 *
		 * @param {Number} time Time in seconds
		 * @return {Object} Map of caption index => caption object ( empty if none )
		 */
		getCaptionForTime: function ( time ) {
			var prevCaption = this.captions[ this.prevIndex ];
			var captionSet = {};

			// Resume at the previous index on forward playback; on a backwards
			// seek start searching at the start:
			var startIndex = ( prevCaption && time >= prevCaption.start ) ? this.prevIndex : 0;

			// Index of the earliest caption in range ( -1 while none was found,
			// so that index 0 is a valid match ).
			var firstCapIndex = -1;

			// Start looking for the text via time, add all matches that are in range
			for ( var i = startIndex; i < this.captions.length; i++ ) {
				var caption = this.captions[ i ];
				// Don't handle captions with 0 or -1 end time
				// ( loose equality kept: captions may come from external loaders
				// and could presumably carry string times — not verified ):
				if ( caption.end == 0 || caption.end == -1 ) {
					continue;
				}

				if ( time >= caption.start && time <= caption.end ) {
					// remember the earliest caption in range as the next start index:
					if ( firstCapIndex === -1 ) {
						firstCapIndex = i;
					}
					captionSet[ i ] = caption;
				}
				// captions are stored in start order stop search if we get larger than time
				if ( caption.start > time ) {
					break;
				}
			}
			// Update the prevIndex with an array index ( not a time value ). With
			// no match keep the index we started from, it is still a valid
			// resume point for forward playback.
			this.prevIndex = ( firstCapIndex === -1 ) ? startIndex : firstCapIndex;
			// Return the set of captions in range:
			return captionSet;
		},

		/**
		 * Check if the caption is an overlay format ( and must be ontop of the player )
		 *
		 * @return {Boolean} true for 'text/xml' sources
		 */
		isOverlay: function(){
			return this.mimeType == 'text/xml';
		},

		/**
		 * Parse raw track data with the parser that matches this.mimeType
		 *
		 * @param {String|Object} data Raw response data
		 * @return {Array} Caption objects, empty if the mime type is unknown
		 */
		getCaptions: function( data ){
			// Detect caption data type:
			switch ( this.mimeType ) {
				case 'text/mw-srt':
					return this.getCaptiosnFromMediaWikiSrt( data );
				case 'text/x-srt':
					return this.getCaptionsFromSrt( data );
				case 'text/xml':
					return this.getCaptionsFromTMML( data );
			}
			// caption mime not found return empty set:
			return [];
		},

		/**
		 * Get the css properties parsed for a given style id
		 *
		 * @param {String} styleId Id of the style
		 * @return {Object} css property map, empty object if the id is unknown
		 */
		getStyleCssById: function( styleId ){
			if ( this.styleCss[ styleId ] ) {
				return this.styleCss[ styleId ];
			}
			return {};
		},

		/**
		 * Grab timed text from TMML format
		 *
		 * Side effect: every <style> element found is translated to a css map
		 * and stored in this.styleCss under its id.
		 *
		 * @param {String|Object} data XML string or already parsed xml
		 * @return {Array} Caption objects with start, end, content, styleId and
		 * 	an optional css map; empty on parse errors
		 */
		getCaptionsFromTMML: function( data ){
			var _this = this;
			mw.log("TextSource::getCaptionsFromTMML", data);
			// set up display information:
			var captions = [];
			var xml;

			// Parse and check for parse errors. The parse itself is inside the
			// try block because $.parseXML throws on invalid xml.
			try {
				xml = ( $( data ).find("tt").length ) ? data : $.parseXML( data );
				if ( !xml || $( xml ).find('parsererror').length ) {
					mw.log("Error: close caption parse error: " + $( xml ).find('parsererror').text() );
					return captions;
				}
			} catch ( e ) {
				mw.log( "Error: close caption parse error: " + e.toString() );
				return captions;
			}

			// Set the body Style
			var bodyStyleId = $( xml ).find('body').attr('style');

			// Set style translate ttml to css
			$( xml ).find( 'style' ).each( function( inx, style ){
				var cssObject = {};
				// Map CamelCase tts: attributes to dashed css properties
				// ( tts:fontWeight => font-weight ):
				$( style.attributes ).each( function( attrInx, attr ){
					var attrName = attr.name;
					if ( attrName.substr( 0, 4 ) !== 'tts:' ) {
						// skip
						return true;
					}
					var cssName = '';
					for ( var c = 4; c < attrName.length; c++ ) {
						var ch = attrName.charAt( c );
						if ( ch.toLowerCase() != ch ) {
							cssName += '-' + ch.toLowerCase();
						} else {
							cssName += ch;
						}
					}
					cssObject[ cssName ] = attr.value;
				});
				// TTML identifies styles with xml:id, fall back to it when there
				// is no plain id attribute:
				var styleId = $( style ).attr( 'id' ) || $( style ).attr( 'xml:id' );
				_this.styleCss[ styleId ] = cssObject;
			});

			$( xml ).find( 'p' ).each( function( inx, p ){
				var $p = $( p );
				// Get text content by converting ttml node to html
				var content = '';
				$.each( p.childNodes, function( nodeInx, node ){
					content += _this.convertTTML2HTML( node );
				});
				// Get the end time:
				var end = null;
				if ( $p.attr( 'end' ) ) {
					end = mw.npt2seconds( $p.attr( 'end' ) );
				}
				// Look for dur
				if ( !end && $p.attr( 'dur' ) ) {
					end = mw.npt2seconds( $p.attr( 'begin' ) ) +
						mw.npt2seconds( $p.attr( 'dur' ) );
				}

				// Create the caption object :
				var captionObj = {
					'start': mw.npt2seconds( $p.attr( 'begin' ) ),
					'end': end,
					'content': content
				};

				// See if we have custom metadata for position of this caption object
				// there are 35 columns across and 15 rows high
				var $meta = $p.find( 'metadata' );
				if ( $meta.length ) {
					captionObj['css'] = {
						'position': 'absolute'
					};
					if ( $meta.attr('cccol') ) {
						captionObj['css']['left'] = ( $meta.attr('cccol') / 35 ) * 100 +'%';
					}
					if ( $meta.attr('ccrow') ) {
						captionObj['css']['top'] = ( $meta.attr('ccrow') / 15 ) * 100 +'%';
					}
				}
				if ( $p.attr('tts:textAlign') ) {
					if ( !captionObj['css'] ) {
						captionObj['css'] = {};
					}
					captionObj['css']['text-align'] = $p.attr('tts:textAlign');

					// If text align is "right" drop the css left offset:
					if ( captionObj['css']['text-align'] == 'right' && captionObj['css']['left'] ) {
						captionObj['css']['left'] = null;
					}
				}

				// check if this p has any style else use the body parent
				if ( $p.attr('style') ) {
					captionObj['styleId'] = $p.attr('style');
				} else {
					captionObj['styleId'] = bodyStyleId;
				}
				captions.push( captionObj );
			});
			return captions;
		},

		/**
		 * Recursively convert a ttml node into an html string
		 *
		 * Text and attribute values are html escaped since the result is
		 * inserted as html.
		 * NOTE(review): the element name itself is passed through unchanged,
		 * consider restricting it to a whitelist.
		 *
		 * @param {Node} node The xml node to convert
		 * @return {String} html for the node; empty string for metadata nodes,
		 * 	empty elements and non text nodes such as comments
		 */
		convertTTML2HTML: function( node ){
			var _this = this;

			// look for text ( or CDATA ) node:
			if ( node.nodeType === 3 || node.nodeType === 4 ) {
				return mw.html.escape( node.textContent );
			}
			// skip metadata nodes:
			if ( node.nodeName == 'metadata' ) {
				return '';
			}
			// if a br just append
			if ( node.nodeName == 'br' ) {
				return '<br />';
			}
			// Nodes without children ( comments, empty elements ) have no
			// content; return an empty string rather than undefined, which the
			// caller would concatenate as the text "undefined".
			if ( !node.childNodes || !node.childNodes.length ) {
				return '';
			}
			// Setup tts mappings TODO should be static property of a ttmlSource object.
			var ttsStyleMap = {
				'tts:color' : 'color',
				'tts:fontWeight' : 'font-weight',
				'tts:fontStyle' : 'font-style'
			};
			var styleVal = '';
			for ( var attr in ttsStyleMap ) {
				if ( node.getAttribute( attr ) ) {
					styleVal += ttsStyleMap[ attr ] + ':' + node.getAttribute( attr ) + ';';
				}
			}
			var nodeString = '<' + node.nodeName + ' style="' + mw.html.escape( styleVal ) + '" >';
			$.each( node.childNodes, function( inx, childNode ){
				nodeString += _this.convertTTML2HTML( childNode );
			});
			nodeString += '</' + node.nodeName + '>';
			return nodeString;
		},

		/**
		 * srt timed text parse handle:
		 *
		 * @param {String} data Srt string to be parsed
		 * @return {Array} Caption objects with start, end and content
		 */
		getCaptionsFromSrt: function ( data ){
			mw.log("TextSource::getCaptionsFromSrt");
			var _this = this;
			// Only strings can be parsed as srt:
			if ( typeof data !== 'string' ) {
				mw.log( "Error: TextSource::getCaptionsFromSrt data is not a string" );
				return [];
			}
			// Check if the "srt" parses as an XML
			try {
				var xml = $.parseXML( data );
				if ( xml && $( xml ).find('parsererror').length == 0 ) {
					return this.getCaptionsFromTMML( data );
				}
			} catch ( e ) {
				// srt should not be xml
			}
			// Remove dos newlines
			var srt = data.replace(/\r+/g, '');

			// Trim white space start and end
			srt = srt.replace(/^\s+|\s+$/g, '');

			// Remove all html tags for security reasons
			srt = srt.replace(/<[a-zA-Z\/][^>]*>/g, '');

			// Get captions, cues are separated by one or more blank lines
			// ( splitting on exactly "\n\n" dropped cues that follow extra
			// blank lines ):
			var captions = [];
			var caplist = srt.split( /\n\s*\n/ );
			for ( var i = 0; i < caplist.length; i++ ) {
				var s = caplist[i].split(/\n/);
				if ( s.length < 2 ) {
					// file format error or comment lines
					continue;
				}
				if ( !s[0].match(/^\d+$/) || !s[1].match(/\d+:\d+:\d+/) ) {
					// file format error or comment lines
					continue;
				}
				// ignore caption number in s[0]
				// parse time string
				var m = s[1].match(/(\d+):(\d+):(\d+)(?:,(\d+))?\s*--?>\s*(\d+):(\d+):(\d+)(?:,(\d+))?/);
				if ( !m ) {
					// Unrecognized timestring
					continue;
				}
				var caption = _this.match2caption( m );
				// concatenate text lines to html text
				caption['content'] = s.slice(2).join("<br>");
				// Add the current caption to the captions set:
				captions.push( caption );
			}

			return captions;
		},

		/**
		 * Get srts from a mediawiki html / srt string
		 *
		 * Right now wiki -> html is not always friendly to our srt parsing.
		 * The long term plan is to move the srt parsing to server side and have the api
		 * serve up the srt's times in JSON form
		 *
		 * Also see https://bugzilla.wikimedia.org/show_bug.cgi?id=29126
		 *
		 * TODO move to mediaWiki specific module.
		 *
		 * @param {String} data Html of the rendered srt wiki page
		 * @return {Array} Caption objects with start, end and content
		 */
		getCaptiosnFromMediaWikiSrt: function( data ){
			mw.log("TimedText::getCaptiosnFromMediaWikiSrt:");
			var _this = this;
			var captions = [ ];
			var curentCap = {
				'content': ''
			};
			// Note this string concatenation and html error wrapping sometimes causes
			// parse issues where the wikitext includes many native <p /> tags without child
			// subtitles. In practice this is not a deal breaker because the wikitext for
			// TimedText namespace and associated srts already has a specific format.
			// Long term we will move to server side parsing.
			$( '<div>' + data + '</div>' ).find('p').each( function() {
				var currentPtext = $(this).html();

				// We translate raw wikitext generated html into a matched srt time sample.
				// The raw html looks like:
				// #
				// hh:mm:ss,ms --> hh:mm:ss,ms
				// text
				//
				// You can read more about the srt format here:
				// http://en.wikipedia.org/wiki/SubRip
				//
				// We attempt to be fairly robust in our regular expression to catch a few
				// srt variations such as omission of commas and empty text lines.

				// .html() serializes ">" as "&gt;", restore "--&gt;" to "-->"
				// for easier srt parsing:
				var timeText = currentPtext.replace( '--&gt;', '-->' );

				var fullMatch = timeText
					.match(/\d+\s([\d\-]+):([\d\-]+):([\d\-]+)(?:,([\d\-]+))?\s*--?>\s*([\d\-]+):([\d\-]+):([\d\-]+)(?:,([\d\-]+))?\n?(.*)/);

				if ( fullMatch ) {
					captions.push(
						_this.match2caption( fullMatch )
					);
					return true;
				}

				/***
				 * Handle multi line style output
				 *
				 * Handles cases where an entire caption can't be parsed in the single
				 * regular expression above, since the different caption parts are output in
				 * different <p /> tags by the wikitext parser output.
				 */

				// Check if we have reached the end of a multi line match, a
				// paragraph that only holds the caption number
				// ( loose equality on purpose: compares the number to its text ):
				if ( parseInt( currentPtext, 10 ) == currentPtext ) {
					if ( curentCap.content != '' ) {
						captions.push( curentCap );
					}
					// Clear out the current caption content
					curentCap = {
						'content': ''
					};
					return true;
				}
				// Check only for time match:
				var timeMatch = timeText
					.match(/(\d+):(\d+):(\d+)(?:,(\d+))?\s*--?>\s*(\d+):(\d+):(\d+)(?:,(\d+))?/);
				if ( timeMatch ) {
					// Update the currentCap:
					curentCap = _this.match2caption( timeMatch );
					// The time only match has no text group; start from an empty
					// string so appended text is not prefixed with "undefined".
					if ( curentCap['content'] === undefined ) {
						curentCap['content'] = '';
					}
					return true;
				}
				// Else append content for the curentCap
				if ( currentPtext != '<br>' ) {
					curentCap['content'] += currentPtext;
				}
			});
			// Push last subtitle ( only if it actually collected content ):
			if ( curentCap.content !== '' ) {
				captions.push( curentCap );
			}
			mw.log( "TimedText::getCaptiosnFromMediaWikiSrt found " + captions.length + ' captions');
			return captions;
		},

		/**
		 * Takes a regular expression match and converts it to a caption object
		 *
		 * @param {Array} m Match with groups 1-4 start ( h, m, s, ms ),
		 * 	groups 5-8 end ( h, m, s, ms ) and optional group 9 text content
		 * @return {Object} caption with start, end and ( if group 9 is set ) content
		 */
		match2caption: function( m ){
			var caption = {};
			// Look for ms:
			var startMs = (m[4]) ? parseInt(m[4], 10) : 0;
			var endMs = (m[8]) ? parseInt(m[8], 10) : 0;
			caption['start'] = this.timeParts2seconds( m[1], m[2], m[3], startMs );
			caption['end'] = this.timeParts2seconds( m[5], m[6], m[7], endMs );
			if ( m[9] ) {
				caption['content'] = $.trim( m[9] );
			}
			return caption;
		},

		/**
		 * Takes time parts in hours, min, seconds and milliseconds and converts
		 * them to seconds via mw.measurements2seconds.
		 *
		 * @param {String|Number} hours
		 * @param {String|Number} min
		 * @param {String|Number} sec
		 * @param {Number} ms
		 * @return {Number} result of mw.measurements2seconds
		 */
		timeParts2seconds: function( hours, min, sec, ms ){
			return mw.measurements2seconds({
				'hours': hours,
				'minutes': min,
				'seconds' : sec,
				'milliseconds': ms
			});
		}
	};


} )( mediaWiki, jQuery );