diff --git a/lib/media/media_source_engine.js b/lib/media/media_source_engine.js
index 3994288a19..4325bdecd7 100644
--- a/lib/media/media_source_engine.js
+++ b/lib/media/media_source_engine.js
@@ -106,6 +106,12 @@ shaka.media.MediaSourceEngine = class {
 
     /** @private {string} */
     this.url_ = '';
+
+    /** @private {boolean} */
+    this.sequenceMode_ = false;
+
+    /** @private {!shaka.util.PublicPromise.<number>} */
+    this.textSequenceModeOffset_ = new shaka.util.PublicPromise();
   }
 
   /**
@@ -331,6 +337,8 @@ shaka.media.MediaSourceEngine = class {
 
     await this.mediaSourceOpen_;
 
+    this.sequenceMode_ = sequenceMode;
+
     for (const contentType of streamsByType.keys()) {
       const stream = streamsByType.get(contentType);
       goog.asserts.assert(
@@ -348,11 +356,9 @@ shaka.media.MediaSourceEngine = class {
         mimeType =
             shaka.media.Transmuxer.convertTsCodecs(contentType, mimeType);
       }
+
       const sourceBuffer = this.mediaSource_.addSourceBuffer(mimeType);
-      if (sequenceMode) {
-        sourceBuffer.mode =
-            shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
-      }
+
       this.eventManager_.listen(
           sourceBuffer, 'error', () => this.onError_(contentType));
@@ -515,35 +521,29 @@ shaka.media.MediaSourceEngine = class {
    * @param {?boolean} hasClosedCaptions True if the buffer contains CEA closed
    *   captions
    * @param {boolean=} seeked True if we just seeked
-   * @param {boolean=} sequenceMode True if sequence mode
    * @return {!Promise}
    */
-  async appendBuffer(contentType, data, startTime, endTime, hasClosedCaptions,
-      seeked, sequenceMode) {
+  async appendBuffer(
+      contentType, data, startTime, endTime, hasClosedCaptions, seeked) {
     const ContentType = shaka.util.ManifestParserUtils.ContentType;
 
-    if (startTime != null && sequenceMode && contentType != ContentType.TEXT) {
-      // If we just cleared buffer and is on an unbuffered seek, we need to set
-      // the new timestampOffset of the sourceBuffer.
-      // Don't do this for text streams, though, since they don't use
-      // MediaSource anyway.
-      if (seeked) {
-        const timestampOffset = /** @type {number} */ (startTime);
-        this.enqueueOperation_(
-            contentType,
-            () => this.setTimestampOffset_(contentType, timestampOffset));
+    if (contentType == ContentType.TEXT) {
+      if (this.sequenceMode_) {
+        // This won't be known until the first video segment is appended.
+        const offset = await this.textSequenceModeOffset_;
+        this.textEngine_.setTimestampOffset(offset);
       }
+      await this.textEngine_.appendBuffer(data, startTime, endTime);
+      return;
     }
 
-    if (contentType == ContentType.TEXT) {
-      await this.textEngine_.appendBuffer(data, startTime, endTime);
-    } else if (this.transmuxers_[contentType]) {
+    if (this.transmuxers_[contentType]) {
       const transmuxedData =
          await this.transmuxers_[contentType].transmux(data);
      // For HLS CEA-608/708 CLOSED-CAPTIONS, text data is embedded in
      // the video stream, so textEngine may not have been initialized.
      if (!this.textEngine_) {
-        this.reinitText('text/vtt', sequenceMode || false);
+        this.reinitText('text/vtt', this.sequenceMode_);
      }
 
      if (transmuxedData.metadata) {
@@ -562,15 +562,10 @@ shaka.media.MediaSourceEngine = class {
             closedCaptions, startTime, endTime, videoOffset);
       }
 
-      let transmuxedSegment = transmuxedData.data;
-      transmuxedSegment = this.workAroundBrokenPlatforms_(
-          transmuxedSegment, startTime, contentType);
-
-      await this.enqueueOperation_(
-          contentType, () => this.append_(contentType, transmuxedSegment));
+      data = transmuxedData.data;
     } else if (hasClosedCaptions) {
       if (!this.textEngine_) {
-        this.reinitText('text/vtt', sequenceMode || false);
+        this.reinitText('text/vtt', this.sequenceMode_);
       }
       // If it is the init segment for closed captions, initialize the closed
       // caption parser.
@@ -585,19 +580,78 @@ shaka.media.MediaSourceEngine = class {
             closedCaptions, startTime, endTime, videoOffset);
         }
       }
+    }
 
-      data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
+    data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
+
+    const sourceBuffer = this.sourceBuffers_[contentType];
+    const SEQUENCE = shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
+
+    if (this.sequenceMode_ && sourceBuffer.mode != SEQUENCE &&
+        startTime != null) {
+      // This is the first media segment to be appended to a SourceBuffer in
+      // sequence mode.  We set the mode late so that we can trick MediaSource
+      // into extracting a timestamp for us to align text segments in sequence
+      // mode.
+
+      // Timestamps can only be reliably extracted from video, not audio.
+      // Packed audio formats do not have internal timestamps at all.
+      // Prefer video for this when available.
+      const isBestSourceBufferForTimestamps =
+          contentType == ContentType.VIDEO ||
+          !(ContentType.VIDEO in this.sourceBuffers_);
+      if (isBestSourceBufferForTimestamps) {
+        // Append the segment in segments mode first, with offset of 0 and an
+        // open append window.
+        const originalRange =
+            [sourceBuffer.appendWindowStart, sourceBuffer.appendWindowEnd];
+        sourceBuffer.appendWindowStart = 0;
+        sourceBuffer.appendWindowEnd = Infinity;
+
+        const originalOffset = sourceBuffer.timestampOffset;
+        sourceBuffer.timestampOffset = 0;
+
+        await this.enqueueOperation_(
+            contentType, () => this.append_(contentType, data));
+
+        // Reset the offset and append window.
+        sourceBuffer.timestampOffset = originalOffset;
+        sourceBuffer.appendWindowStart = originalRange[0];
+        sourceBuffer.appendWindowEnd = originalRange[1];
+
+        // Now get the timestamp of the segment and compute the offset for text
+        // segments.
+        const mediaStartTime = shaka.media.TimeRangesUtils.bufferStart(
+            this.getBuffered_(contentType));
+        const textOffset = (startTime || 0) - (mediaStartTime || 0);
+        this.textSequenceModeOffset_.resolve(textOffset);
+
+        // Finally, clear the buffer.
+        await this.enqueueOperation_(
+            contentType,
+            () => this.remove_(contentType, 0, this.mediaSource_.duration));
+      }
-      await this.enqueueOperation_(
-          contentType,
-          () => this.append_(contentType, data));
-    } else {
-      data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
+
+      // Now switch to sequence mode and fall through to our normal operations.
+      sourceBuffer.mode = SEQUENCE;
+    }
 
-      await this.enqueueOperation_(
-          contentType,
-          () => this.append_(contentType, data));
+    if (startTime != null && this.sequenceMode_ &&
+        contentType != ContentType.TEXT) {
+      // In sequence mode, for non-text streams, if we just cleared the buffer
+      // and are performing an unbuffered seek, we need to set a new
+      // timestampOffset on the sourceBuffer.
+      if (seeked) {
+        const timestampOffset = /** @type {number} */ (startTime);
+        this.enqueueOperation_(
+            contentType,
+            () => this.setTimestampOffset_(contentType, timestampOffset));
+      }
     }
+
+    await this.enqueueOperation_(
+        contentType,
+        () => this.append_(contentType, data));
   }
 
   /**
diff --git a/lib/media/streaming_engine.js b/lib/media/streaming_engine.js
index 3d79bdc645..c5814758dd 100644
--- a/lib/media/streaming_engine.js
+++ b/lib/media/streaming_engine.js
@@ -1605,8 +1605,7 @@ shaka.media.StreamingEngine = class {
         reference.syncTime == null ? reference.startTime : reference.syncTime,
         reference.endTime,
         hasClosedCaptions,
-        seeked,
-        this.manifest_.sequenceMode);
+        seeked);
     this.destroyer_.ensureNotDestroyed();
     shaka.log.v2(logPrefix, 'appended media segment');
   }
diff --git a/lib/text/vtt_text_parser.js b/lib/text/vtt_text_parser.js
index 51bd1ad02a..5bc923d488 100644
--- a/lib/text/vtt_text_parser.js
+++ b/lib/text/vtt_text_parser.js
@@ -68,13 +68,15 @@ shaka.text.VttTextParser = class {
     // to the beginning of each segment.
     // NOTE: "periodStart" is the timestamp offset applied via TextEngine.
     // It is no longer closely tied to periods, but the name stuck around.
+    // NOTE: This offset and the flag choosing its meaning have no effect on
+    // HLS content, which should use X-TIMESTAMP-MAP and periodStart instead.
     let offset = time.vttOffset;
 
-    // Do not honor the 'X-TIMESTAMP-MAP' value when in sequence mode.
-    // That is because it is used mainly (solely?) to account for the timestamp
-    // offset of the video/audio; when in sequence mode, we normalize that
-    // timestamp offset to 0, so we should not account for it.
-    if (blocks[0].includes('X-TIMESTAMP-MAP') && !this.sequenceMode_) {
+    // Only use 'X-TIMESTAMP-MAP' in sequence mode, as that is currently
+    // shorthand for HLS.  Note that an offset based on the first video
+    // timestamp has already been extracted, and appears in periodStart.
+    // The relative offset from X-TIMESTAMP-MAP will be added to that for HLS.
+    if (blocks[0].includes('X-TIMESTAMP-MAP') && this.sequenceMode_) {
       // https://bit.ly/2K92l7y
       // The 'X-TIMESTAMP-MAP' header is used in HLS to align text with
       // the rest of the media.
@@ -109,8 +111,6 @@ shaka.text.VttTextParser = class {
         mpegTime += shaka.text.VttTextParser.TS_ROLLOVER_;
       }
 
-      // Apple-encoded HLS content uses absolute timestamps, so assume the
-      // presence of the map tag means the content uses absolute timestamps.
      offset = time.periodStart + mpegTime / mpegTimescale - cueTime;
    }
  }
diff --git a/test/text/vtt_text_parser_unit.js b/test/text/vtt_text_parser_unit.js
index 1218a5c30f..31414973aa 100644
--- a/test/text/vtt_text_parser_unit.js
+++ b/test/text/vtt_text_parser_unit.js
@@ -535,7 +535,61 @@ describe('VttTextParser', () => {
         'Test\n\n' +
         '00:00:40.000 --> 00:00:50.000 line:-1\n' +
         'Test2',
-        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0});
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ true);
   });
+
+  it('ignores X-TIMESTAMP-MAP header if not in sequence mode', () => {
+    verifyHelper(
+        [
+          {startTime: 20, endTime: 40, payload: 'Test'},
+          {startTime: 40, endTime: 50, payload: 'Test2'},
+        ],
+        'WEBVTT\n' +
+        'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:01:00:00.000\n\n' +
+        '00:00:20.000 --> 00:00:40.000 line:0\n' +
+        'Test\n\n' +
+        '00:00:40.000 --> 00:00:50.000 line:-1\n' +
+        'Test2',
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ false);
+  });
+
+  it('parses X-TIMESTAMP-MAP header with non-zero local base', () => {
+    verifyHelper(
+        [
+          {startTime: 1800, endTime: 1810, payload: 'Test'},
+          {startTime: 1820, endTime: 1830, payload: 'Test2'},
+        ],
+        // 162000000 = 30 * 60 * 90k = 30 minutes for the TS part of the map.
+        // The local (VTT) part of the map is 1 hour.
+        // So text times of 1 hour map to media times of 30 minutes.
+        'WEBVTT\n' +
+        'X-TIMESTAMP-MAP=MPEGTS:162000000,LOCAL:01:00:00.000\n\n' +
+        '01:00:00.000 --> 01:00:10.000 line:0\n' +
+        'Test\n\n' +
+        '01:00:20.000 --> 01:00:30.000 line:-1\n' +
+        'Test2',
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ true);
+  });
+
+  it('combines X-TIMESTAMP-MAP header with periodStart', () => {
+    verifyHelper(
+        [
+          {startTime: 130, endTime: 150, payload: 'Test'},
+          {startTime: 150, endTime: 160, payload: 'Test2'},
+        ],
+        // 900000 = 10 sec, so expect every timestamp to be 10
+        // seconds ahead of what is specified.
+        'WEBVTT\n' +
+        'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:00:00:00.000\n\n' +
+        '00:00:20.000 --> 00:00:40.000 line:0\n' +
+        'Test\n\n' +
+        '00:00:40.000 --> 00:00:50.000 line:-1\n' +
+        'Test2',
+        {periodStart: 100, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ true);
+  });
 
   it('handles timestamp rollover with X-TIMESTAMP-MAP header', () => {
@@ -551,7 +605,8 @@ describe('VttTextParser', () => {
         'Test',
         // Non-null segmentStart takes precedence over X-TIMESTAMP-MAP.
         // This protects us from rollover in the MPEGTS field.
-        {periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0});
+        {periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0},
+        /* sequenceMode= */ true);
 
     verifyHelper(
         [
           {startTime: 95552, endTime: 95554, payload: 'Test2'},
         ],
         'WEBVTT\n' +
         'X-TIMESTAMP-MAP=MPEGTS:9745408,LOCAL:00:00:00.000\n\n' +
         '00:00:00.000 --> 00:00:02.000 line:0\n' +
         'Test2',
-        {periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0});
+        {periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0},
+        /* sequenceMode= */ true);
   });
 
   it('supports global style blocks', () => {
@@ -978,11 +1034,14 @@ describe('VttTextParser', () => {
    * @param {!Array} cues
    * @param {string} text
    * @param {shaka.extern.TextParser.TimeContext} time
+   * @param {boolean=} sequenceMode
    */
-  function verifyHelper(cues, text, time) {
+  function verifyHelper(cues, text, time, sequenceMode = false) {
     const data =
         shaka.util.BufferUtils.toUint8(shaka.util.StringUtils.toUTF8(text));
-    const result = new shaka.text.VttTextParser().parseMedia(data, time);
+    const parser = new shaka.text.VttTextParser();
+    parser.setSequenceMode(sequenceMode);
+    const result = parser.parseMedia(data, time);
 
     const expected = cues.map((cue) => {
      if (cue.nestedCues) {
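
Note on the MediaSourceEngine change above: media and text appends now meet at textSequenceModeOffset_. The first media segment is appended in segments mode with a zero offset purely to learn where it lands in the buffer; the measured delta between the manifest start time and the buffered start resolves the promise, and any text append waiting on it then applies the offset. A minimal standalone sketch of that handshake, for illustration only (deferred(), onFirstMediaAppend(), and appendTextSegment() are hypothetical stand-ins, not Shaka APIs; shaka.util.PublicPromise plays the role of deferred() in the real code):

    // deferred(): a stand-in for shaka.util.PublicPromise, i.e. a promise
    // whose resolve() can be called from outside the executor.
    function deferred() {
      let resolve;
      const promise = new Promise((res) => { resolve = res; });
      return {promise, resolve};
    }

    const textSequenceModeOffset = deferred();

    // Media path (hypothetical hook): after the first segment is appended in
    // segments mode, compare the manifest start time with where the segment
    // actually landed, mirroring the textOffset computation in the diff above.
    function onFirstMediaAppend(startTime, bufferedStart) {
      textSequenceModeOffset.resolve((startTime || 0) - (bufferedStart || 0));
    }

    // Text path (hypothetical hook): block until the offset is known, then
    // apply it before appending, as appendBuffer() now does for TEXT content.
    async function appendTextSegment(textEngine, data, start, end) {
      const offset = await textSequenceModeOffset.promise;
      textEngine.setTimestampOffset(offset);
      await textEngine.appendBuffer(data, start, end);
    }

One consequence of this design worth noting in review: a sequence-mode text append will wait until the first media segment has been measured, since the promise only resolves from the media path.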
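
Note on the VttTextParser tests above: the expected cue times follow directly from the parser's assignment offset = time.periodStart + mpegTime / mpegTimescale - cueTime. A quick arithmetic check of two of the new cases (plain script, illustration only; names mirror the parser's locals):

    const mpegTimescale = 90000;  // MPEG-2 TS ticks per second

    // 'combines X-TIMESTAMP-MAP header with periodStart':
    // MPEGTS:900000 -> 10 s, LOCAL:00:00:00.000 -> cueTime 0, periodStart 100.
    let offset = 100 + 900000 / mpegTimescale - 0;  // 110
    console.log(20 + offset, 40 + offset);  // 130 150, the expected cue times

    // 'parses X-TIMESTAMP-MAP header with non-zero local base':
    // MPEGTS:162000000 -> 1800 s, LOCAL:01:00:00.000 -> cueTime 3600 s.
    offset = 0 + 162000000 / mpegTimescale - 3600;  // -1800
    console.log(3600 + offset, 3610 + offset);  // 1800 1810, as expected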