diff --git a/lib/media/media_source_engine.js b/lib/media/media_source_engine.js
index 3994288a19..4325bdecd7 100644
--- a/lib/media/media_source_engine.js
+++ b/lib/media/media_source_engine.js
@@ -106,6 +106,12 @@ shaka.media.MediaSourceEngine = class {
 
     /** @private {string} */
     this.url_ = '';
+
+    /** @private {boolean} */
+    this.sequenceMode_ = false;
+
+    /** @private {!shaka.util.PublicPromise.<number>} */
+    this.textSequenceModeOffset_ = new shaka.util.PublicPromise();
   }
 
   /**
@@ -331,6 +337,8 @@ shaka.media.MediaSourceEngine = class {
 
     await this.mediaSourceOpen_;
 
+    this.sequenceMode_ = sequenceMode;
+
     for (const contentType of streamsByType.keys()) {
       const stream = streamsByType.get(contentType);
       goog.asserts.assert(
@@ -348,11 +356,9 @@ shaka.media.MediaSourceEngine = class {
         mimeType =
             shaka.media.Transmuxer.convertTsCodecs(contentType, mimeType);
       }
+
       const sourceBuffer = this.mediaSource_.addSourceBuffer(mimeType);
-      if (sequenceMode) {
-        sourceBuffer.mode =
-            shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
-      }
+
       this.eventManager_.listen(
           sourceBuffer, 'error', () => this.onError_(contentType));
@@ -515,35 +521,29 @@ shaka.media.MediaSourceEngine = class {
    * @param {?boolean} hasClosedCaptions True if the buffer contains CEA closed
    *   captions
    * @param {boolean=} seeked True if we just seeked
-   * @param {boolean=} sequenceMode True if sequence mode
    * @return {!Promise}
    */
-  async appendBuffer(contentType, data, startTime, endTime, hasClosedCaptions,
-      seeked, sequenceMode) {
+  async appendBuffer(
+      contentType, data, startTime, endTime, hasClosedCaptions, seeked) {
     const ContentType = shaka.util.ManifestParserUtils.ContentType;
 
-    if (startTime != null && sequenceMode && contentType != ContentType.TEXT) {
-      // If we just cleared buffer and is on an unbuffered seek, we need to set
-      // the new timestampOffset of the sourceBuffer.
-      // Don't do this for text streams, though, since they don't use
-      // MediaSource anyway.
-      if (seeked) {
-        const timestampOffset = /** @type {number} */ (startTime);
-        this.enqueueOperation_(
-            contentType,
-            () => this.setTimestampOffset_(contentType, timestampOffset));
+    if (contentType == ContentType.TEXT) {
+      if (this.sequenceMode_) {
+        // This won't be known until the first video segment is appended.
+        const offset = await this.textSequenceModeOffset_;
+        this.textEngine_.setTimestampOffset(offset);
       }
+      await this.textEngine_.appendBuffer(data, startTime, endTime);
+      return;
     }
 
-    if (contentType == ContentType.TEXT) {
-      await this.textEngine_.appendBuffer(data, startTime, endTime);
-    } else if (this.transmuxers_[contentType]) {
+    if (this.transmuxers_[contentType]) {
       const transmuxedData =
          await this.transmuxers_[contentType].transmux(data);
      // For HLS CEA-608/708 CLOSED-CAPTIONS, text data is embedded in
      // the video stream, so textEngine may not have been initialized.
      if (!this.textEngine_) {
-        this.reinitText('text/vtt', sequenceMode || false);
+        this.reinitText('text/vtt', this.sequenceMode_);
      }
 
      if (transmuxedData.metadata) {
@@ -562,15 +562,10 @@ shaka.media.MediaSourceEngine = class {
             closedCaptions, startTime, endTime, videoOffset);
       }
 
-      let transmuxedSegment = transmuxedData.data;
-      transmuxedSegment = this.workAroundBrokenPlatforms_(
-          transmuxedSegment, startTime, contentType);
-
-      await this.enqueueOperation_(
-          contentType, () => this.append_(contentType, transmuxedSegment));
+      data = transmuxedData.data;
     } else if (hasClosedCaptions) {
       if (!this.textEngine_) {
-        this.reinitText('text/vtt', sequenceMode || false);
+        this.reinitText('text/vtt', this.sequenceMode_);
       }
       // If it is the init segment for closed captions, initialize the closed
       // caption parser.
@@ -585,19 +580,78 @@ shaka.media.MediaSourceEngine = class {
             closedCaptions, startTime, endTime, videoOffset);
         }
       }
+    }
 
-      data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
+    data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
+
+    const sourceBuffer = this.sourceBuffers_[contentType];
+    const SEQUENCE = shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
+
+    if (this.sequenceMode_ && sourceBuffer.mode != SEQUENCE &&
+        startTime != null) {
+      // This is the first media segment to be appended to a SourceBuffer in
+      // sequence mode.  We set the mode late so that we can trick MediaSource
+      // into extracting a timestamp for us to align text segments in sequence
+      // mode.
+
+      // Timestamps can only be reliably extracted from video, not audio.
+      // Packed audio formats do not have internal timestamps at all.
+      // Prefer video for this when available.
+      const isBestSourceBufferForTimestamps =
+          contentType == ContentType.VIDEO ||
+          !(ContentType.VIDEO in this.sourceBuffers_);
+      if (isBestSourceBufferForTimestamps) {
+        // Append the segment in segments mode first, with offset of 0 and an
+        // open append window.
+        const originalRange =
+            [sourceBuffer.appendWindowStart, sourceBuffer.appendWindowEnd];
+        sourceBuffer.appendWindowStart = 0;
+        sourceBuffer.appendWindowEnd = Infinity;
+
+        const originalOffset = sourceBuffer.timestampOffset;
+        sourceBuffer.timestampOffset = 0;
+
+        await this.enqueueOperation_(
+            contentType, () => this.append_(contentType, data));
+
+        // Reset the offset and append window.
+        sourceBuffer.timestampOffset = originalOffset;
+        sourceBuffer.appendWindowStart = originalRange[0];
+        sourceBuffer.appendWindowEnd = originalRange[1];
+
+        // Now get the timestamp of the segment and compute the offset for text
+        // segments.
+        const mediaStartTime = shaka.media.TimeRangesUtils.bufferStart(
+            this.getBuffered_(contentType));
+        const textOffset = (startTime || 0) - (mediaStartTime || 0);
+        this.textSequenceModeOffset_.resolve(textOffset);
+
+        // Finally, clear the buffer.
+        await this.enqueueOperation_(
+            contentType,
+            () => this.remove_(contentType, 0, this.mediaSource_.duration));
+      }
-      await this.enqueueOperation_(
-          contentType,
-          () => this.append_(contentType, data));
-    } else {
-      data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
+
+      // Now switch to sequence mode and fall through to our normal operations.
+      sourceBuffer.mode = SEQUENCE;
+    }
 
-      await this.enqueueOperation_(
-          contentType,
-          () => this.append_(contentType, data));
+    if (startTime != null && this.sequenceMode_ &&
+        contentType != ContentType.TEXT) {
+      // In sequence mode, for non-text streams, if we just cleared the buffer
+      // and are performing an unbuffered seek, we need to set a new
+      // timestampOffset on the sourceBuffer.
+      if (seeked) {
+        const timestampOffset = /** @type {number} */ (startTime);
+        this.enqueueOperation_(
+            contentType,
+            () => this.setTimestampOffset_(contentType, timestampOffset));
+      }
     }
+
+    await this.enqueueOperation_(
+        contentType,
+        () => this.append_(contentType, data));
   }
 
   /**
diff --git a/lib/media/streaming_engine.js b/lib/media/streaming_engine.js
index 3d79bdc645..c5814758dd 100644
--- a/lib/media/streaming_engine.js
+++ b/lib/media/streaming_engine.js
@@ -1605,8 +1605,7 @@ shaka.media.StreamingEngine = class {
         reference.syncTime == null ? reference.startTime : reference.syncTime,
         reference.endTime,
         hasClosedCaptions,
-        seeked,
-        this.manifest_.sequenceMode);
+        seeked);
     this.destroyer_.ensureNotDestroyed();
     shaka.log.v2(logPrefix, 'appended media segment');
   }
diff --git a/lib/text/vtt_text_parser.js b/lib/text/vtt_text_parser.js
index 51bd1ad02a..5bc923d488 100644
--- a/lib/text/vtt_text_parser.js
+++ b/lib/text/vtt_text_parser.js
@@ -68,13 +68,15 @@ shaka.text.VttTextParser = class {
     // to the beginning of each segment.
     // NOTE: "periodStart" is the timestamp offset applied via TextEngine.
     // It is no longer closely tied to periods, but the name stuck around.
+    // NOTE: This offset and the flag choosing its meaning have no effect on
+    // HLS content, which should use X-TIMESTAMP-MAP and periodStart instead.
     let offset = time.vttOffset;
 
-    // Do not honor the 'X-TIMESTAMP-MAP' value when in sequence mode.
-    // That is because it is used mainly (solely?) to account for the timestamp
-    // offset of the video/audio; when in sequence mode, we normalize that
-    // timestamp offset to 0, so we should not account for it.
-    if (blocks[0].includes('X-TIMESTAMP-MAP') && !this.sequenceMode_) {
+    // Only use 'X-TIMESTAMP-MAP' in sequence mode, as that is currently
+    // shorthand for HLS.  Note that an offset based on the first video
+    // timestamp has already been extracted, and appears in periodStart.
+    // The relative offset from X-TIMESTAMP-MAP will be added to that for HLS.
+    if (blocks[0].includes('X-TIMESTAMP-MAP') && this.sequenceMode_) {
       // https://bit.ly/2K92l7y
       // The 'X-TIMESTAMP-MAP' header is used in HLS to align text with
       // the rest of the media.
@@ -109,8 +111,6 @@ shaka.text.VttTextParser = class {
         mpegTime += shaka.text.VttTextParser.TS_ROLLOVER_;
       }
 
-      // Apple-encoded HLS content uses absolute timestamps, so assume the
-      // presence of the map tag means the content uses absolute timestamps.
      offset = time.periodStart + mpegTime / mpegTimescale - cueTime;
    }
  }
diff --git a/test/text/vtt_text_parser_unit.js b/test/text/vtt_text_parser_unit.js
index 1218a5c30f..31414973aa 100644
--- a/test/text/vtt_text_parser_unit.js
+++ b/test/text/vtt_text_parser_unit.js
@@ -535,7 +535,61 @@ describe('VttTextParser', () => {
         'Test\n\n' +
         '00:00:40.000 --> 00:00:50.000 line:-1\n' +
         'Test2',
-        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0});
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ true);
   });
+
+  it('ignores X-TIMESTAMP-MAP header if not in sequence mode', () => {
+    verifyHelper(
+        [
+          {startTime: 20, endTime: 40, payload: 'Test'},
+          {startTime: 40, endTime: 50, payload: 'Test2'},
+        ],
+        'WEBVTT\n' +
+        'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:01:00:00.000\n\n' +
+        '00:00:20.000 --> 00:00:40.000 line:0\n' +
+        'Test\n\n' +
+        '00:00:40.000 --> 00:00:50.000 line:-1\n' +
+        'Test2',
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ false);
+  });
+
+  it('parses X-TIMESTAMP-MAP header with non-zero local base', () => {
+    verifyHelper(
+        [
+          {startTime: 1800, endTime: 1810, payload: 'Test'},
+          {startTime: 1820, endTime: 1830, payload: 'Test2'},
+        ],
+        // 162000000 = 30 * 60 * 90k = 30 minutes for the TS part of the map.
+        // The local (VTT) part of the map is 1 hour.
+        // So text times of 1 hour map to media times of 30 minutes.
+        'WEBVTT\n' +
+        'X-TIMESTAMP-MAP=MPEGTS:162000000,LOCAL:01:00:00.000\n\n' +
+        '01:00:00.000 --> 01:00:10.000 line:0\n' +
+        'Test\n\n' +
+        '01:00:20.000 --> 01:00:30.000 line:-1\n' +
+        'Test2',
+        {periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ true);
+  });
+
+  it('combines X-TIMESTAMP-MAP header with periodStart', () => {
+    verifyHelper(
+        [
+          {startTime: 130, endTime: 150, payload: 'Test'},
+          {startTime: 150, endTime: 160, payload: 'Test2'},
+        ],
+        // 900000 = 10 sec, so expect every timestamp to be 10
+        // seconds ahead of what is specified.
+        'WEBVTT\n' +
+        'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:00:00:00.000\n\n' +
+        '00:00:20.000 --> 00:00:40.000 line:0\n' +
+        'Test\n\n' +
+        '00:00:40.000 --> 00:00:50.000 line:-1\n' +
+        'Test2',
+        {periodStart: 100, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
+        /* sequenceMode= */ true);
+  });
 
   it('handles timestamp rollover with X-TIMESTAMP-MAP header', () => {
@@ -551,7 +605,8 @@ describe('VttTextParser', () => {
         'Test',
         // Non-null segmentStart takes precedence over X-TIMESTAMP-MAP.
         // This protects us from rollover in the MPEGTS field.
-        {periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0});
+        {periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0},
+        /* sequenceMode= */ true);
 
     verifyHelper(
         [
           {startTime: 95552, endTime: 95554, payload: 'Test2'},
         ],
         'WEBVTT\n' +
         'X-TIMESTAMP-MAP=MPEGTS:9745408,LOCAL:00:00:00.000\n\n' +
         '00:00:00.000 --> 00:00:02.000 line:0\n' +
         'Test2',
-        {periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0});
+        {periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0},
+        /* sequenceMode= */ true);
   });
 
   it('supports global style blocks', () => {
@@ -978,11 +1034,14 @@ describe('VttTextParser', () => {
    * @param {!Array} cues
    * @param {string} text
    * @param {shaka.extern.TextParser.TimeContext} time
+   * @param {boolean=} sequenceMode
    */
-  function verifyHelper(cues, text, time) {
+  function verifyHelper(cues, text, time, sequenceMode = false) {
     const data =
         shaka.util.BufferUtils.toUint8(shaka.util.StringUtils.toUTF8(text));
-    const result = new shaka.text.VttTextParser().parseMedia(data, time);
+    const parser = new shaka.text.VttTextParser();
+    parser.setSequenceMode(sequenceMode);
+    const result = parser.parseMedia(data, time);
 
     const expected = cues.map((cue) => {
      if (cue.nestedCues) {
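
Note on the MediaSourceEngine change above: media and text appends now meet at textSequenceModeOffset_. The first media segment is appended in segments mode with a zero offset purely to learn where it lands in the buffer; the measured delta between the manifest start time and the buffered start resolves the promise, and any text append waiting on it then applies the offset. A minimal standalone sketch of that handshake, for illustration only (deferred(), onFirstMediaAppend(), and appendTextSegment() are hypothetical stand-ins, not Shaka APIs; shaka.util.PublicPromise plays the role of deferred() in the real code):

    // deferred(): a stand-in for shaka.util.PublicPromise, i.e. a promise
    // whose resolve() can be called from outside the executor.
    function deferred() {
      let resolve;
      const promise = new Promise((res) => { resolve = res; });
      return {promise, resolve};
    }

    const textSequenceModeOffset = deferred();

    // Media path (hypothetical hook): after the first segment is appended in
    // segments mode, compare the manifest start time with where the segment
    // actually landed, mirroring the textOffset computation in the diff above.
    function onFirstMediaAppend(startTime, bufferedStart) {
      textSequenceModeOffset.resolve((startTime || 0) - (bufferedStart || 0));
    }

    // Text path (hypothetical hook): block until the offset is known, then
    // apply it before appending, as appendBuffer() now does for TEXT content.
    async function appendTextSegment(textEngine, data, start, end) {
      const offset = await textSequenceModeOffset.promise;
      textEngine.setTimestampOffset(offset);
      await textEngine.appendBuffer(data, start, end);
    }

One consequence of this design worth noting in review: a sequence-mode text append will wait until the first media segment has been measured, since the promise only resolves from the media path.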
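
Note on the VttTextParser tests above: the expected cue times follow directly from the parser's assignment offset = time.periodStart + mpegTime / mpegTimescale - cueTime. A quick arithmetic check of two of the new cases (plain script, illustration only; names mirror the parser's locals):

    const mpegTimescale = 90000;  // MPEG-2 TS ticks per second

    // 'combines X-TIMESTAMP-MAP header with periodStart':
    // MPEGTS:900000 -> 10 s, LOCAL:00:00:00.000 -> cueTime 0, periodStart 100.
    let offset = 100 + 900000 / mpegTimescale - 0;  // 110
    console.log(20 + offset, 40 + offset);  // 130 150, the expected cue times

    // 'parses X-TIMESTAMP-MAP header with non-zero local base':
    // MPEGTS:162000000 -> 1800 s, LOCAL:01:00:00.000 -> cueTime 3600 s.
    offset = 0 + 162000000 / mpegTimescale - 3600;  // -1800
    console.log(3600 + offset, 3610 + offset);  // 1800 1810, as expected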