lib/cea/mp4_cea_parser.js

/*! @license
 * Shaka Player
 * Copyright 2016 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

goog.provide('shaka.cea.Mp4CeaParser');

goog.require('goog.asserts');
goog.require('shaka.cea.ICeaParser');
goog.require('shaka.cea.SeiProcessor');
goog.require('shaka.util.DataViewReader');
goog.require('shaka.util.Error');
goog.require('shaka.util.Mp4Parser');
goog.require('shaka.util.Mp4BoxParsers');

/**
 * MPEG4 stream parser used for extracting 708 closed captions data.
 * @implements {shaka.cea.ICeaParser}
 */
shaka.cea.Mp4CeaParser = class {
  /** */
  constructor() {
    /**
     * SEI data processor.
     * @private
     * @const {!shaka.cea.SeiProcessor}
     */
    this.seiProcessor_ = new shaka.cea.SeiProcessor();

    /**
     * Map of track id to corresponding timescale.
     * @private {!Map<number, number>}
     */
    this.trackIdToTimescale_ = new Map();

    /**
     * Default sample duration, as specified by the TREX box.
     * @private {number}
     */
    this.defaultSampleDuration_ = 0;

    /**
     * Default sample size, as specified by the TREX box.
     * @private {number}
     */
    this.defaultSampleSize_ = 0;
  }

  /**
   * Parses the init segment. Gets Default Sample Duration and Size from the
   * TREX box, and constructs a map of Track IDs to timescales. Each TRAK box
   * contains a track header (TKHD) containing track ID, and a media header box
   * (MDHD) containing the timescale for the track
   * @override
   */
  init(initSegment) {
    const Mp4Parser = shaka.util.Mp4Parser;
    const trackIds = [];
    const timescales = [];

    new Mp4Parser()
        .box('moov', Mp4Parser.children)
        .box('mvex', Mp4Parser.children)
        .fullBox('trex', (box) => {
          const parsedTREXBox = shaka.util.Mp4BoxParsers.parseTREX(
              box.reader);

          this.defaultSampleDuration_ = parsedTREXBox.defaultSampleDuration;
          this.defaultSampleSize_ = parsedTREXBox.defaultSampleSize;
        })
        .box('trak', Mp4Parser.children)
        .fullBox('tkhd', (box) => {
          goog.asserts.assert(
              box.version != null,
              'TKHD is a full box and should have a valid version.');
          const parsedTKHDBox = shaka.util.Mp4BoxParsers.parseTKHD(
              box.reader, box.version);
          trackIds.push(parsedTKHDBox.trackId);
        })
        .box('mdia', Mp4Parser.children)
        .fullBox('mdhd', (box) => {
          goog.asserts.assert(
              box.version != null,
              'MDHD is a full box and should have a valid version.');
          const parsedMDHDBox = shaka.util.Mp4BoxParsers.parseMDHD(
              box.reader, box.version);
          timescales.push(parsedMDHDBox.timescale);
        })
        .parse(initSegment, /* partialOkay= */ true);

    // At least one track should exist, and each track should have a
    // corresponding Id in TKHD box, and timescale in its MDHD box
    if (!trackIds.length|| !timescales.length ||
      trackIds.length != timescales.length) {
      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL,
          shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.INVALID_MP4_CEA);
    }

    // Populate the map from track Id to timescale
    trackIds.forEach((trackId, idx) => {
      this.trackIdToTimescale_.set(trackId, timescales[idx]);
    });
  }

  /**
   * Parses each video segment. In fragmented MP4s, MOOF and MDAT come in
   * pairs. The following logic gets the necessary info from MOOFs to parse
   * MDATs (base media decode time, sample sizes/offsets/durations, etc),
   * and then parses the MDAT boxes for CEA-708 packets using this information.
   * CEA-708 packets are returned in the callback.
   * @override
   */
  parse(mediaSegment) {
    const Mp4Parser = shaka.util.Mp4Parser;

    /** @type {!Array<!shaka.cea.ICeaParser.CaptionPacket>} **/
    const captionPackets = [];

    // Fields that are found in MOOF boxes
    let defaultSampleDuration = this.defaultSampleDuration_;
    let defaultSampleSize = this.defaultSampleSize_;
    let sampleData = [];
    let baseMediaDecodeTime = null;
    let timescale = shaka.cea.ICeaParser.DEFAULT_TIMESCALE_VALUE;

    new Mp4Parser()
        .box('moof', Mp4Parser.children)
        .box('traf', Mp4Parser.children)
        .fullBox('trun', (box) => {
          goog.asserts.assert(
              box.version != null && box.flags!=null,
              'TRUN is a full box and should have a valid version & flags.');

          const parsedTRUN = shaka.util.Mp4BoxParsers.parseTRUN(
              box.reader, box.version, box.flags);

          sampleData = parsedTRUN.sampleData;
        })

        .fullBox('tfhd', (box) => {
          goog.asserts.assert(
              box.flags != null,
              'TFHD is a full box and should have valid flags.');

          const parsedTFHD = shaka.util.Mp4BoxParsers.parseTFHD(
              box.reader, box.flags);

          // If specified, defaultSampleDuration and defaultSampleSize
          // override the ones specified in the TREX box
          defaultSampleDuration = parsedTFHD.defaultSampleDuration ||
              this.defaultSampleDuration_;

          defaultSampleSize = parsedTFHD.defaultSampleSize ||
              this.defaultSampleSize_;

          const trackId = parsedTFHD.trackId;

          // Get the timescale from the track Id
          if (this.trackIdToTimescale_.has(trackId)) {
            timescale = this.trackIdToTimescale_.get(trackId);
          }
        })

        .fullBox('tfdt', (box) => {
          goog.asserts.assert(
              box.version != null,
              'TFDT is a full box and should have a valid version.');

          const parsedTFDT = shaka.util.Mp4BoxParsers.parseTFDT(
              box.reader, box.version);

          baseMediaDecodeTime = parsedTFDT.baseMediaDecodeTime;
        })
        .box('mdat', (box) => {
          if (baseMediaDecodeTime === null) {
            // This field should have been populated by
            // the Base Media Decode time in the TFDT box
            throw new shaka.util.Error(
                shaka.util.Error.Severity.CRITICAL,
                shaka.util.Error.Category.TEXT,
                shaka.util.Error.Code.INVALID_MP4_CEA);
          }
          this.parseMdat_(box.reader, baseMediaDecodeTime, timescale,
              defaultSampleDuration, defaultSampleSize, sampleData,
              captionPackets);
        })
        .parse(mediaSegment, /* partialOkay= */ false);

    return captionPackets;
  }

  /**
   * Parse MDAT box.
   * @param {!shaka.util.DataViewReader} reader
   * @param {number} time
   * @param {number} timescale
   * @param {number} defaultSampleDuration
   * @param {number} defaultSampleSize
   * @param {!Array<shaka.util.ParsedTRUNSample>} sampleData
   * @param {!Array<!shaka.cea.ICeaParser.CaptionPacket>} captionPackets
   * @private
   */
  parseMdat_(reader, time, timescale, defaultSampleDuration,
      defaultSampleSize, sampleData, captionPackets) {
    let sampleIndex = 0;

    // The fields in each ParsedTRUNSample contained in the sampleData
    // array are nullable. In the case of sample data and sample duration,
    // we use the defaults provided by the TREX/TFHD boxes. For sample
    // composition time offset, we default to 0.
    let sampleSize = defaultSampleSize;

    if (sampleData.length) {
      sampleSize = sampleData[0].sampleSize || defaultSampleSize;
    }

    while (reader.hasMoreData()) {
      const naluSize = reader.readUint32();
      const naluType = reader.readUint8() & 0x1F;
      if (naluType == shaka.cea.ICeaParser.NALU_TYPE_SEI) {
        let timeOffset = 0;

        if (sampleData.length > sampleIndex) {
          timeOffset = sampleData[sampleIndex].sampleCompositionTimeOffset || 0;
        }

        const pts = (time + timeOffset)/timescale;
        for (const packet of this.seiProcessor_
            .process(reader.readBytes(naluSize - 1))) {
          captionPackets.push({
            packet,
            pts,
          });
        }
      } else {
        try {
          reader.skip(naluSize - 1);
        } catch (e) {
          // It is necessary to ignore this error because it can break the start
          // of playback even if the user does not want to see the subtitles.
          break;
        }
      }
      sampleSize -= (naluSize + 4);
      if (sampleSize == 0) {
        if (sampleData.length > sampleIndex) {
          time += sampleData[sampleIndex].sampleDuration ||
              defaultSampleDuration;
        } else {
          time += defaultSampleDuration;
        }

        sampleIndex++;

        if (sampleData.length > sampleIndex) {
          sampleSize = sampleData[sampleIndex].sampleSize || defaultSampleSize;
        } else {
          sampleSize = defaultSampleSize;
        }
      }
    }
  }
};