Revert "fix: Fix VTT cue timing in HLS (shaka-project#4217)"

This reverts commit 69d1c14.
nyanmisaka · Oct 6, 2022 · 344e865 · 344e865
1 parent caee130
commit 344e865
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 163 deletions.
diff --git a/lib/media/media_source_engine.js b/lib/media/media_source_engine.js
@@ -106,12 +106,6 @@ shaka.media.MediaSourceEngine = class {
 
     /** @private {string} */
     this.url_ = '';
-
-    /** @private {boolean} */
-    this.sequenceMode_ = false;
-
-    /** @private {!shaka.util.PublicPromise.<number>} */
-    this.textSequenceModeOffset_ = new shaka.util.PublicPromise();
   }
 
   /**
@@ -337,8 +331,6 @@ shaka.media.MediaSourceEngine = class {
 
     await this.mediaSourceOpen_;
 
-    this.sequenceMode_ = sequenceMode;
-
     for (const contentType of streamsByType.keys()) {
       const stream = streamsByType.get(contentType);
       goog.asserts.assert(
@@ -356,9 +348,11 @@ shaka.media.MediaSourceEngine = class {
           mimeType =
               shaka.media.Transmuxer.convertTsCodecs(contentType, mimeType);
         }
-
         const sourceBuffer = this.mediaSource_.addSourceBuffer(mimeType);
-
+        if (sequenceMode) {
+          sourceBuffer.mode =
+              shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
+        }
         this.eventManager_.listen(
             sourceBuffer, 'error',
             () => this.onError_(contentType));
@@ -521,29 +515,35 @@ shaka.media.MediaSourceEngine = class {
    * @param {?boolean} hasClosedCaptions True if the buffer contains CEA closed
    * captions
    * @param {boolean=} seeked True if we just seeked
+   * @param {boolean=} sequenceMode True if sequence mode
    * @return {!Promise}
    */
-  async appendBuffer(
-      contentType, data, startTime, endTime, hasClosedCaptions, seeked) {
+  async appendBuffer(contentType, data, startTime, endTime, hasClosedCaptions,
+      seeked, sequenceMode) {
     const ContentType = shaka.util.ManifestParserUtils.ContentType;
 
-    if (contentType == ContentType.TEXT) {
-      if (this.sequenceMode_) {
-        // This won't be known until the first video segment is appended.
-        const offset = await this.textSequenceModeOffset_;
-        this.textEngine_.setTimestampOffset(offset);
+    if (startTime != null && sequenceMode && contentType != ContentType.TEXT) {
+      // If we just cleared the buffer and are performing an unbuffered seek,
+      // we need to set a new timestampOffset on the sourceBuffer.
+      // Don't do this for text streams, though, since they don't use
+      // MediaSource anyway.
+      if (seeked) {
+        const timestampOffset = /** @type {number} */ (startTime);
+        this.enqueueOperation_(
+            contentType,
+            () => this.setTimestampOffset_(contentType, timestampOffset));
       }
-      await this.textEngine_.appendBuffer(data, startTime, endTime);
-      return;
     }
 
-    if (this.transmuxers_[contentType]) {
+    if (contentType == ContentType.TEXT) {
+      await this.textEngine_.appendBuffer(data, startTime, endTime);
+    } else if (this.transmuxers_[contentType]) {
       const transmuxedData =
           await this.transmuxers_[contentType].transmux(data);
       // For HLS CEA-608/708 CLOSED-CAPTIONS, text data is embedded in
       // the video stream, so textEngine may not have been initialized.
       if (!this.textEngine_) {
-        this.reinitText('text/vtt', this.sequenceMode_);
+        this.reinitText('text/vtt', sequenceMode || false);
       }
 
       if (transmuxedData.metadata) {
@@ -562,10 +562,15 @@ shaka.media.MediaSourceEngine = class {
             closedCaptions, startTime, endTime, videoOffset);
       }
 
-      data = transmuxedData.data;
+      let transmuxedSegment = transmuxedData.data;
+      transmuxedSegment = this.workAroundBrokenPlatforms_(
+          transmuxedSegment, startTime, contentType);
+
+      await this.enqueueOperation_(
+          contentType, () => this.append_(contentType, transmuxedSegment));
     } else if (hasClosedCaptions) {
       if (!this.textEngine_) {
-        this.reinitText('text/vtt', this.sequenceMode_);
+        this.reinitText('text/vtt', sequenceMode || false);
       }
       // If it is the init segment for closed captions, initialize the closed
       // caption parser.
@@ -580,78 +585,19 @@ shaka.media.MediaSourceEngine = class {
               closedCaptions, startTime, endTime, videoOffset);
         }
       }
-    }
 
-    data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
-
-    const sourceBuffer = this.sourceBuffers_[contentType];
-    const SEQUENCE = shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
-
-    if (this.sequenceMode_ && sourceBuffer.mode != SEQUENCE &&
-        startTime != null) {
-      // This is the first media segment to be appended to a SourceBuffer in
-      // sequence mode.  We set the mode late so that we can trick MediaSource
-      // into extracting a timestamp for us to align text segments in sequence
-      // mode.
-
-      // Timestamps can only be reliably extracted from video, not audio.
-      // Packed audio formats do not have internal timestamps at all.
-      // Prefer video for this when available.
-      const isBestSourceBufferForTimestamps =
-          contentType == ContentType.VIDEO ||
-          !(ContentType.VIDEO in this.sourceBuffers_);
-      if (isBestSourceBufferForTimestamps) {
-        // Append the segment in segments mode first, with offset of 0 and an
-        // open append window.
-        const originalRange =
-            [sourceBuffer.appendWindowStart, sourceBuffer.appendWindowEnd];
-        sourceBuffer.appendWindowStart = 0;
-        sourceBuffer.appendWindowEnd = Infinity;
-
-        const originalOffset = sourceBuffer.timestampOffset;
-        sourceBuffer.timestampOffset = 0;
-
-        await this.enqueueOperation_(
-            contentType, () => this.append_(contentType, data));
-
-        // Reset the offset and append window.
-        sourceBuffer.timestampOffset = originalOffset;
-        sourceBuffer.appendWindowStart = originalRange[0];
-        sourceBuffer.appendWindowEnd = originalRange[1];
-
-        // Now get the timestamp of the segment and compute the offset for text
-        // segments.
-        const mediaStartTime = shaka.media.TimeRangesUtils.bufferStart(
-            this.getBuffered_(contentType));
-        const textOffset = (startTime || 0) - (mediaStartTime || 0);
-        this.textSequenceModeOffset_.resolve(textOffset);
-
-        // Finally, clear the buffer.
-        await this.enqueueOperation_(
-            contentType,
-            () => this.remove_(contentType, 0, this.mediaSource_.duration));
-      }
+      data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
 
-      // Now switch to sequence mode and fall through to our normal operations.
-      sourceBuffer.mode = SEQUENCE;
-    }
+      await this.enqueueOperation_(
+          contentType,
+          () => this.append_(contentType, data));
+    } else {
+      data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
 
-    if (startTime != null && this.sequenceMode_ &&
-        contentType != ContentType.TEXT) {
-      // In sequence mode, for non-text streams, if we just cleared the buffer
-      // and are performing an unbuffered seek, we need to set a new
-      // timestampOffset on the sourceBuffer.
-      if (seeked) {
-        const timestampOffset = /** @type {number} */ (startTime);
-        this.enqueueOperation_(
-            contentType,
-            () => this.setTimestampOffset_(contentType, timestampOffset));
-      }
+      await this.enqueueOperation_(
+          contentType,
+          () => this.append_(contentType, data));
     }
-
-    await this.enqueueOperation_(
-        contentType,
-        () => this.append_(contentType, data));
   }
 
   /**

diff --git a/lib/media/streaming_engine.js b/lib/media/streaming_engine.js
@@ -1605,7 +1605,8 @@ shaka.media.StreamingEngine = class {
         reference.syncTime == null ? reference.startTime : reference.syncTime,
         reference.endTime,
         hasClosedCaptions,
-        seeked);
+        seeked,
+        this.manifest_.sequenceMode);
     this.destroyer_.ensureNotDestroyed();
     shaka.log.v2(logPrefix, 'appended media segment');
   }

diff --git a/lib/text/vtt_text_parser.js b/lib/text/vtt_text_parser.js
@@ -68,15 +68,13 @@ shaka.text.VttTextParser = class {
     // to the beginning of each segment.
     // NOTE: "periodStart" is the timestamp offset applied via TextEngine.
     // It is no longer closely tied to periods, but the name stuck around.
-    // NOTE: This offset and the flag choosing its meaning have no effect on
-    // HLS content, which should use X-TIMESTAMP-MAP and periodStart instead.
     let offset = time.vttOffset;
 
-    // Only use 'X-TIMESTAMP-MAP' in sequence mode, as that is currently
-    // shorthand for HLS.  Note that an offset based on the first video
-    // timestamp has already been extracted, and appears in periodStart.
-    // The relative offset from X-TIMESTAMP-MAP will be added to that for HLS.
-    if (blocks[0].includes('X-TIMESTAMP-MAP') && this.sequenceMode_) {
+    // Do not honor the 'X-TIMESTAMP-MAP' value when in sequence mode.
+    // That is because it is used mainly (solely?) to account for the timestamp
+    // offset of the video/audio; when in sequence mode, we normalize that
+    // timestamp offset to 0, so we should not account for it.
+    if (blocks[0].includes('X-TIMESTAMP-MAP') && !this.sequenceMode_) {
       // https://bit.ly/2K92l7y
       // The 'X-TIMESTAMP-MAP' header is used in HLS to align text with
       // the rest of the media.
@@ -111,6 +109,8 @@ shaka.text.VttTextParser = class {
           mpegTime += shaka.text.VttTextParser.TS_ROLLOVER_;
         }
 
+        // Apple-encoded HLS content uses absolute timestamps, so assume the
+        // presence of the map tag means the content uses absolute timestamps.
         offset = time.periodStart + mpegTime / mpegTimescale - cueTime;
       }
     }

diff --git a/test/text/vtt_text_parser_unit.js b/test/text/vtt_text_parser_unit.js
@@ -535,61 +535,7 @@ describe('VttTextParser', () => {
         'Test\n\n' +
         '00:00:40.000 --> 00:00:50.000 line:-1\n' +
         'Test2',
-        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
-        /* sequenceMode= */ true);
-  });
-
-  it('ignores X-TIMESTAMP-MAP header if not in sequence mode', () => {
-    verifyHelper(
-        [
-          {startTime: 20, endTime: 40, payload: 'Test'},
-          {startTime: 40, endTime: 50, payload: 'Test2'},
-        ],
-        'WEBVTT\n' +
-        'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:01:00:00.000\n\n' +
-        '00:00:20.000 --> 00:00:40.000 line:0\n' +
-        'Test\n\n' +
-        '00:00:40.000 --> 00:00:50.000 line:-1\n' +
-        'Test2',
-        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
-        /* sequenceMode= */ false);
-  });
-
-  it('parses X-TIMESTAMP-MAP header with non-zero local base', () => {
-    verifyHelper(
-        [
-          {startTime: 1800, endTime: 1810, payload: 'Test'},
-          {startTime: 1820, endTime: 1830, payload: 'Test2'},
-        ],
-        // 162000000 = 30 * 60 * 90k = 30 minutes for the TS part of the map.
-        // The local (VTT) part of the map is 1 hour.
-        // So text times of 1 hour map to media times of 30 minutes.
-        'WEBVTT\n' +
-        'X-TIMESTAMP-MAP=MPEGTS:162000000,LOCAL:01:00:00.000\n\n' +
-        '01:00:00.000 --> 01:00:10.000 line:0\n' +
-        'Test\n\n' +
-        '01:00:20.000 --> 01:00:30.000 line:-1\n' +
-        'Test2',
-        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
-        /* sequenceMode= */ true);
-  });
-
-  it('combines X-TIMESTAMP-MAP header with periodStart', () => {
-    verifyHelper(
-        [
-          {startTime: 130, endTime: 150, payload: 'Test'},
-          {startTime: 150, endTime: 160, payload: 'Test2'},
-        ],
-        // 900000 = 10 sec, so expect every timestamp to be 10
-        // seconds ahead of what is specified.
-        'WEBVTT\n' +
-        'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:00:00:00.000\n\n' +
-        '00:00:20.000 --> 00:00:40.000 line:0\n' +
-        'Test\n\n' +
-        '00:00:40.000 --> 00:00:50.000 line:-1\n' +
-        'Test2',
-        {periodStart: 100, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
-        /* sequenceMode= */ true);
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0});
   });
 
   it('handles timestamp rollover with X-TIMESTAMP-MAP header', () => {
@@ -605,8 +551,7 @@ describe('VttTextParser', () => {
         'Test',
         // Non-null segmentStart takes precedence over X-TIMESTAMP-MAP.
         // This protects us from rollover in the MPEGTS field.
-        {periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0},
-        /* sequenceMode= */ true);
+        {periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0});
 
     verifyHelper(
         [
@@ -619,8 +564,7 @@ describe('VttTextParser', () => {
         'X-TIMESTAMP-MAP=MPEGTS:9745408,LOCAL:00:00:00.000\n\n' +
         '00:00:00.000 --> 00:00:02.000 line:0\n' +
         'Test2',
-        {periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0},
-        /* sequenceMode= */ true);
+        {periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0});
   });
 
   it('supports global style blocks', () => {
@@ -1034,14 +978,11 @@ describe('VttTextParser', () => {
    * @param {!Array} cues
    * @param {string} text
    * @param {shaka.extern.TextParser.TimeContext} time
-   * @param {boolean=} sequenceMode
    */
-  function verifyHelper(cues, text, time, sequenceMode = false) {
+  function verifyHelper(cues, text, time) {
     const data =
         shaka.util.BufferUtils.toUint8(shaka.util.StringUtils.toUTF8(text));
-    const parser = new shaka.text.VttTextParser();
-    parser.setSequenceMode(sequenceMode);
-    const result = parser.parseMedia(data, time);
+    const result = new shaka.text.VttTextParser().parseMedia(data, time);
 
     const expected = cues.map((cue) => {
       if (cue.nestedCues) {