From 58182605a7da3c18a7331828c319c88446a13d52 Mon Sep 17 00:00:00 2001
From: Joey Parrish <joeyparrish@users.noreply.github.com>
Date: Wed, 11 May 2022 15:26:02 -0700
Subject: [PATCH] fix: Fix VTT cue timing in HLS (#4217)

Since the transition to sequence mode for HLS in v4.0.0, VTT cue
timings have been broken.  This is mainly because VTT cue timing in HLS
is meant to be based on an offset from the media timestamps, and we
generally don't know those now that we use sequence mode.

To fix it, this change uses MediaSource "segments" mode for the very
first video segment as a way to extract the timestamp, then clears the
buffer, switches to sequence mode, and appends the segment again.  This
lets us get the timing data we need, while avoiding major drawbacks of
the previous HLS implementation:
 - We don't need to fetch segments upfront (which is high latency)
 - We don't need to fetch segments twice (once for timestamps, and
   once again to buffer)
 - We don't need to maintain parsers (which were complex and limited
   the formats we could support)

Closes #4191
---
 lib/media/media_source_engine.js  | 128 +++++++++++++++++++++---------
 lib/media/streaming_engine.js     |   3 +-
 lib/text/vtt_text_parser.js       |  14 ++--
 test/text/vtt_text_parser_unit.js |  69 ++++++++++++++--
 4 files changed, 163 insertions(+), 51 deletions(-)
diff --git a/lib/media/media_source_engine.js b/lib/media/media_source_engine.js
index 3994288a19..4325bdecd7 100644
--- a/lib/media/media_source_engine.js
+++ b/lib/media/media_source_engine.js
@@ -106,6 +106,12 @@ shaka.media.MediaSourceEngine = class {
 
     /** @private {string} */
     this.url_ = '';
+
+    /** @private {boolean} */
+    this.sequenceMode_ = false;
+
+    /** @private {!shaka.util.PublicPromise.<number>} */
+    this.textSequenceModeOffset_ = new shaka.util.PublicPromise();
   }
 
   /**
@@ -331,6 +337,8 @@ shaka.media.MediaSourceEngine = class {
 
     await this.mediaSourceOpen_;
 
+    this.sequenceMode_ = sequenceMode;
+
     for (const contentType of streamsByType.keys()) {
       const stream = streamsByType.get(contentType);
       goog.asserts.assert(
@@ -348,11 +356,9 @@ shaka.media.MediaSourceEngine = class {
           mimeType =
               shaka.media.Transmuxer.convertTsCodecs(contentType, mimeType);
         }
+
         const sourceBuffer = this.mediaSource_.addSourceBuffer(mimeType);
-        if (sequenceMode) {
-          sourceBuffer.mode =
-              shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
-        }
+
         this.eventManager_.listen(
             sourceBuffer, 'error',
             () => this.onError_(contentType));
@@ -515,35 +521,29 @@ shaka.media.MediaSourceEngine = class {
    * @param {?boolean} hasClosedCaptions True if the buffer contains CEA closed
    * captions
    * @param {boolean=} seeked True if we just seeked
-   * @param {boolean=} sequenceMode True if sequence mode
    * @return {!Promise}
    */
-  async appendBuffer(contentType, data, startTime, endTime, hasClosedCaptions,
-      seeked, sequenceMode) {
+  async appendBuffer(
+      contentType, data, startTime, endTime, hasClosedCaptions, seeked) {
     const ContentType = shaka.util.ManifestParserUtils.ContentType;
 
-    if (startTime != null && sequenceMode && contentType != ContentType.TEXT) {
-      // If we just cleared buffer and is on an unbuffered seek, we need to set
-      // the new timestampOffset of the sourceBuffer.
-      // Don't do this for text streams, though, since they don't use
-      // MediaSource anyway.
-      if (seeked) {
-        const timestampOffset = /** @type {number} */ (startTime);
-        this.enqueueOperation_(
-            contentType,
-            () => this.setTimestampOffset_(contentType, timestampOffset));
+    if (contentType == ContentType.TEXT) {
+      if (this.sequenceMode_) {
+        // This won't be known until the first video segment is appended.
+        const offset = await this.textSequenceModeOffset_;
+        this.textEngine_.setTimestampOffset(offset);
       }
+      await this.textEngine_.appendBuffer(data, startTime, endTime);
+      return;
     }
 
-    if (contentType == ContentType.TEXT) {
-      await this.textEngine_.appendBuffer(data, startTime, endTime);
-    } else if (this.transmuxers_[contentType]) {
+    if (this.transmuxers_[contentType]) {
       const transmuxedData =
           await this.transmuxers_[contentType].transmux(data);
       // For HLS CEA-608/708 CLOSED-CAPTIONS, text data is embedded in
       // the video stream, so textEngine may not have been initialized.
       if (!this.textEngine_) {
-        this.reinitText('text/vtt', sequenceMode || false);
+        this.reinitText('text/vtt', this.sequenceMode_);
       }
 
       if (transmuxedData.metadata) {
@@ -562,15 +562,10 @@ shaka.media.MediaSourceEngine = class {
             closedCaptions, startTime, endTime, videoOffset);
       }
 
-      let transmuxedSegment = transmuxedData.data;
-      transmuxedSegment = this.workAroundBrokenPlatforms_(
-          transmuxedSegment, startTime, contentType);
-
-      await this.enqueueOperation_(
-          contentType, () => this.append_(contentType, transmuxedSegment));
+      data = transmuxedData.data;
     } else if (hasClosedCaptions) {
       if (!this.textEngine_) {
-        this.reinitText('text/vtt', sequenceMode || false);
+        this.reinitText('text/vtt', this.sequenceMode_);
       }
       // If it is the init segment for closed captions, initialize the closed
       // caption parser.
@@ -585,19 +580,78 @@ shaka.media.MediaSourceEngine = class {
               closedCaptions, startTime, endTime, videoOffset);
         }
       }
+    }
 
-      data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
+    data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
+
+    const sourceBuffer = this.sourceBuffers_[contentType];
+    const SEQUENCE = shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
+
+    if (this.sequenceMode_ && sourceBuffer.mode != SEQUENCE &&
+        startTime != null) {
+      // This is the first media segment to be appended to a SourceBuffer in
+      // sequence mode.  We set the mode late so that we can trick MediaSource
+      // into extracting a timestamp for us to align text segments in sequence
+      // mode.
+
+      // Timestamps can only be reliably extracted from video, not audio.
+      // Packed audio formats do not have internal timestamps at all.
+      // Prefer video for this when available.
+      const isBestSourceBufferForTimestamps =
+          contentType == ContentType.VIDEO ||
+          !(ContentType.VIDEO in this.sourceBuffers_);
+      if (isBestSourceBufferForTimestamps) {
+        // Append the segment in segments mode first, with offset of 0 and an
+        // open append window.
+        const originalRange =
+            [sourceBuffer.appendWindowStart, sourceBuffer.appendWindowEnd];
+        sourceBuffer.appendWindowStart = 0;
+        sourceBuffer.appendWindowEnd = Infinity;
+
+        const originalOffset = sourceBuffer.timestampOffset;
+        sourceBuffer.timestampOffset = 0;
+
+        await this.enqueueOperation_(
+            contentType, () => this.append_(contentType, data));
+
+        // Reset the offset and append window.
+        sourceBuffer.timestampOffset = originalOffset;
+        sourceBuffer.appendWindowStart = originalRange[0];
+        sourceBuffer.appendWindowEnd = originalRange[1];
+
+        // Now get the timestamp of the segment and compute the offset for text
+        // segments.
+        const mediaStartTime = shaka.media.TimeRangesUtils.bufferStart(
+            this.getBuffered_(contentType));
+        const textOffset = (startTime || 0) - (mediaStartTime || 0);
+        this.textSequenceModeOffset_.resolve(textOffset);
+
+        // Finally, clear the buffer.
+        await this.enqueueOperation_(
+            contentType,
+            () => this.remove_(contentType, 0, this.mediaSource_.duration));
+      }
 
-      await this.enqueueOperation_(
-          contentType,
-          () => this.append_(contentType, data));
-    } else {
-      data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
+      // Now switch to sequence mode and fall through to our normal operations.
+      sourceBuffer.mode = SEQUENCE;
+    }
 
-      await this.enqueueOperation_(
-          contentType,
-          () => this.append_(contentType, data));
+    if (startTime != null && this.sequenceMode_ &&
+        contentType != ContentType.TEXT) {
+      // In sequence mode, for non-text streams, if we just cleared the buffer
+      // and are performing an unbuffered seek, we need to set a new
+      // timestampOffset on the sourceBuffer.
+      if (seeked) {
+        const timestampOffset = /** @type {number} */ (startTime);
+        this.enqueueOperation_(
+            contentType,
+            () => this.setTimestampOffset_(contentType, timestampOffset));
+      }
     }
+
+    await this.enqueueOperation_(
+        contentType,
+        () => this.append_(contentType, data));
   }
 
   /**
diff --git a/lib/media/streaming_engine.js b/lib/media/streaming_engine.js
index ef55a7fdad..9da8ef2a5a 100644
--- a/lib/media/streaming_engine.js
+++ b/lib/media/streaming_engine.js
@@ -1612,8 +1612,7 @@ shaka.media.StreamingEngine = class {
         reference.syncTime == null ? reference.startTime : reference.syncTime,
         reference.endTime,
         hasClosedCaptions,
-        seeked,
-        this.manifest_.sequenceMode);
+        seeked);
     this.destroyer_.ensureNotDestroyed();
     shaka.log.v2(logPrefix, 'appended media segment');
   }
diff --git a/lib/text/vtt_text_parser.js b/lib/text/vtt_text_parser.js
index 51bd1ad02a..5bc923d488 100644
--- a/lib/text/vtt_text_parser.js
+++ b/lib/text/vtt_text_parser.js
@@ -68,13 +68,15 @@ shaka.text.VttTextParser = class {
     // to the beginning of each segment.
     // NOTE: "periodStart" is the timestamp offset applied via TextEngine.
     // It is no longer closely tied to periods, but the name stuck around.
+    // NOTE: This offset and the flag choosing its meaning have no effect on
+    // HLS content, which should use X-TIMESTAMP-MAP and periodStart instead.
     let offset = time.vttOffset;
 
-    // Do not honor the 'X-TIMESTAMP-MAP' value when in sequence mode.
-    // That is because it is used mainly (solely?) to account for the timestamp
-    // offset of the video/audio; when in sequence mode, we normalize that
-    // timestamp offset to 0, so we should not account for it.
-    if (blocks[0].includes('X-TIMESTAMP-MAP') && !this.sequenceMode_) {
+    // Only use 'X-TIMESTAMP-MAP' in sequence mode, as that is currently
+    // shorthand for HLS.  Note that an offset based on the first video
+    // timestamp has already been extracted, and appears in periodStart.
+    // The relative offset from X-TIMESTAMP-MAP will be added to that for HLS.
+    if (blocks[0].includes('X-TIMESTAMP-MAP') && this.sequenceMode_) {
       // https://bit.ly/2K92l7y
       // The 'X-TIMESTAMP-MAP' header is used in HLS to align text with
       // the rest of the media.
@@ -109,8 +111,6 @@ shaka.text.VttTextParser = class {
           mpegTime += shaka.text.VttTextParser.TS_ROLLOVER_;
         }
 
-        // Apple-encoded HLS content uses absolute timestamps, so assume the
-        // presence of the map tag means the content uses absolute timestamps.
         offset = time.periodStart + mpegTime / mpegTimescale - cueTime;
       }
     }
diff --git a/test/text/vtt_text_parser_unit.js b/test/text/vtt_text_parser_unit.js
index 1218a5c30f..31414973aa 100644
--- a/test/text/vtt_text_parser_unit.js
+++ b/test/text/vtt_text_parser_unit.js
@@ -535,7 +535,61 @@ describe('VttTextParser', () => {
         'Test\n\n' +
         '00:00:40.000 --> 00:00:50.000 line:-1\n' +
         'Test2',
-        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0});
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ true);
+  });
+
+  it('ignores X-TIMESTAMP-MAP header if not in sequence mode', () => {
+    verifyHelper(
+        [
+          {startTime: 20, endTime: 40, payload: 'Test'},
+          {startTime: 40, endTime: 50, payload: 'Test2'},
+        ],
+        'WEBVTT\n' +
+        'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:01:00:00.000\n\n' +
+        '00:00:20.000 --> 00:00:40.000 line:0\n' +
+        'Test\n\n' +
+        '00:00:40.000 --> 00:00:50.000 line:-1\n' +
+        'Test2',
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ false);
+  });
+
+  it('parses X-TIMESTAMP-MAP header with non-zero local base', () => {
+    verifyHelper(
+        [
+          {startTime: 1800, endTime: 1810, payload: 'Test'},
+          {startTime: 1820, endTime: 1830, payload: 'Test2'},
+        ],
+        // 162000000 = 30 * 60 * 90k = 30 minutes for the TS part of the map.
+        // The local (VTT) part of the map is 1 hour.
+        // So text times of 1 hour map to media times of 30 minutes.
+        'WEBVTT\n' +
+        'X-TIMESTAMP-MAP=MPEGTS:162000000,LOCAL:01:00:00.000\n\n' +
+        '01:00:00.000 --> 01:00:10.000 line:0\n' +
+        'Test\n\n' +
+        '01:00:20.000 --> 01:00:30.000 line:-1\n' +
+        'Test2',
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ true);
+  });
+
+  it('combines X-TIMESTAMP-MAP header with periodStart', () => {
+    verifyHelper(
+        [
+          {startTime: 130, endTime: 150, payload: 'Test'},
+          {startTime: 150, endTime: 160, payload: 'Test2'},
+        ],
+        // 900000 = 10 sec, and periodStart adds another 100 sec, so expect
+        // every timestamp to be 110 seconds ahead of what is specified.
+        'WEBVTT\n' +
+        'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:00:00:00.000\n\n' +
+        '00:00:20.000 --> 00:00:40.000 line:0\n' +
+        'Test\n\n' +
+        '00:00:40.000 --> 00:00:50.000 line:-1\n' +
+        'Test2',
+        {periodStart: 100, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ true);
   });
 
   it('handles timestamp rollover with X-TIMESTAMP-MAP header', () => {
@@ -551,7 +605,8 @@ describe('VttTextParser', () => {
         'Test',
         // Non-null segmentStart takes precedence over X-TIMESTAMP-MAP.
         // This protects us from rollover in the MPEGTS field.
-        {periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0});
+        {periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0},
+        /* sequenceMode= */ true);
 
     verifyHelper(
         [
@@ -564,7 +619,8 @@ describe('VttTextParser', () => {
         'X-TIMESTAMP-MAP=MPEGTS:9745408,LOCAL:00:00:00.000\n\n' +
         '00:00:00.000 --> 00:00:02.000 line:0\n' +
         'Test2',
-        {periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0});
+        {periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0},
+        /* sequenceMode= */ true);
   });
 
   it('supports global style blocks', () => {
@@ -978,11 +1034,14 @@ describe('VttTextParser', () => {
    * @param {!Array} cues
    * @param {string} text
    * @param {shaka.extern.TextParser.TimeContext} time
+   * @param {boolean=} sequenceMode
    */
-  function verifyHelper(cues, text, time) {
+  function verifyHelper(cues, text, time, sequenceMode = false) {
     const data =
         shaka.util.BufferUtils.toUint8(shaka.util.StringUtils.toUTF8(text));
-    const result = new shaka.text.VttTextParser().parseMedia(data, time);
+    const parser = new shaka.text.VttTextParser();
+    parser.setSequenceMode(sequenceMode);
+    const result = parser.parseMedia(data, time);
 
     const expected = cues.map((cue) => {
       if (cue.nestedCues) {