Add support for CMAF SEI Captions

Resolves #2623 (Add CMAF CC support). Resolves #4242 (Fix parsing multiple user data unregistered SEI). Closes #4421 (Does hls.js support extracting SEI frame from videos?). Fixes #4317 (Progressive mode is broken with fmp4 playback).
video-dev · Dec 15, 2021 · 864b55d
1 parent 3e59a8a
commit 864b55d
Show file tree

Hide file tree

Showing 14 changed files with 668 additions and 388 deletions.
diff --git a/api-extractor/report/hls.js.api.md b/api-extractor/report/hls.js.api.md
@@ -2143,9 +2143,19 @@ export type TSDemuxerConfig = {
 // @public (undocumented)
 export interface UserdataSample {
     // (undocumented)
-    bytes: Uint8Array;
+    bytes?: Uint8Array;
+    // (undocumented)
+    payloadType?: number;
     // (undocumented)
     pts: number;
+    // (undocumented)
+    type?: number;
+    // (undocumented)
+    userData?: string;
+    // (undocumented)
+    userDataBytes?: Uint8Array;
+    // (undocumented)
+    uuid?: string;
 }
 
 // Warnings were encountered during analysis:

diff --git a/src/controller/timeline-controller.ts b/src/controller/timeline-controller.ts
@@ -664,23 +664,25 @@ export class TimelineController implements ComponentAPI {
   }
 
   private extractCea608Data(byteArray: Uint8Array): number[][] {
-    const count = byteArray[0] & 31;
-    let position = 2;
     const actualCCBytes: number[][] = [[], []];
+    const count = byteArray[0] & 0x1f;
+    let position = 2;
 
     for (let j = 0; j < count; j++) {
       const tmpByte = byteArray[position++];
       const ccbyte1 = 0x7f & byteArray[position++];
       const ccbyte2 = 0x7f & byteArray[position++];
-      const ccValid = (4 & tmpByte) !== 0;
-      const ccType = 3 & tmpByte;
-
       if (ccbyte1 === 0 && ccbyte2 === 0) {
         continue;
       }
-
+      const ccValid = (0x04 & tmpByte) !== 0; // Support all four channels
       if (ccValid) {
-        if (ccType === 0 || ccType === 1) {
+        const ccType = 0x03 & tmpByte;
+        if (
+          0x00 /* CEA608 field1*/ === ccType ||
+          0x01 /* CEA608 field2*/ === ccType
+        ) {
+          // Exclude CEA708 CC data.
           actualCCBytes[ccType].push(ccbyte1);
           actualCCBytes[ccType].push(ccbyte2);
         }

diff --git a/src/demux/aacdemuxer.ts b/src/demux/aacdemuxer.ts
@@ -19,8 +19,13 @@ class AACDemuxer extends BaseAudioDemuxer {
     this.config = config;
   }
 
-  resetInitSegment(audioCodec, videoCodec, duration) {
-    super.resetInitSegment(audioCodec, videoCodec, duration);
+  resetInitSegment(
+    initSegment: Uint8Array | undefined,
+    audioCodec: string | undefined,
+    videoCodec: string | undefined,
+    trackDuration: number
+  ) {
+    super.resetInitSegment(initSegment, audioCodec, videoCodec, trackDuration);
     this._audioTrack = {
       container: 'audio/adts',
       type: 'audio',
@@ -30,7 +35,7 @@ class AACDemuxer extends BaseAudioDemuxer {
       isAAC: true,
       samples: [],
       manifestCodec: audioCodec,
-      duration: duration,
+      duration: trackDuration,
       inputTimeScale: 90000,
       dropped: 0,
     };

diff --git a/src/demux/base-audio-demuxer.ts b/src/demux/base-audio-demuxer.ts
@@ -5,7 +5,7 @@ import type {
   DemuxedAudioTrack,
   AudioFrame,
   DemuxedMetadataTrack,
-  DemuxedAvcTrack,
+  DemuxedVideoTrack,
   DemuxedUserdataTrack,
   KeyData,
 } from '../types/demuxer';
@@ -20,7 +20,12 @@ class BaseAudioDemuxer implements Demuxer {
   protected cachedData: Uint8Array | null = null;
   protected initPTS: number | null = null;
 
-  resetInitSegment(audioCodec: string, videoCodec: string, duration: number) {
+  resetInitSegment(
+    initSegment: Uint8Array | undefined,
+    audioCodec: string | undefined,
+    videoCodec: string | undefined,
+    trackDuration: number
+  ) {
     this._id3Track = {
       type: 'id3',
       id: 3,
@@ -109,7 +114,7 @@ class BaseAudioDemuxer implements Demuxer {
 
     return {
       audioTrack: track,
-      avcTrack: dummyTrack() as DemuxedAvcTrack,
+      videoTrack: dummyTrack() as DemuxedVideoTrack,
       id3Track,
       textTrack: dummyTrack() as DemuxedUserdataTrack,
     };
@@ -137,7 +142,7 @@ class BaseAudioDemuxer implements Demuxer {
 
     return {
       audioTrack: this._audioTrack,
-      avcTrack: dummyTrack() as DemuxedAvcTrack,
+      videoTrack: dummyTrack() as DemuxedVideoTrack,
       id3Track: this._id3Track,
       textTrack: dummyTrack() as DemuxedUserdataTrack,
     };

diff --git a/src/demux/dummy-demuxed-track.ts b/src/demux/dummy-demuxed-track.ts
@@ -1,11 +1,11 @@
 import type { DemuxedTrack } from '../types/demuxer';
 
-export function dummyTrack(): DemuxedTrack {
+export function dummyTrack(type = '', inputTimeScale = 90000): DemuxedTrack {
   return {
-    type: '',
+    type,
     id: -1,
     pid: -1,
-    inputTimeScale: 90000,
+    inputTimeScale,
     sequenceNumber: -1,
     samples: [],
     dropped: 0,

diff --git a/src/demux/mp3demuxer.ts b/src/demux/mp3demuxer.ts
@@ -9,8 +9,13 @@ import * as MpegAudio from './mpegaudio';
 class MP3Demuxer extends BaseAudioDemuxer {
   static readonly minProbeByteLength: number = 4;
 
-  resetInitSegment(audioCodec, videoCodec, duration) {
-    super.resetInitSegment(audioCodec, videoCodec, duration);
+  resetInitSegment(
+    initSegment: Uint8Array | undefined,
+    audioCodec: string | undefined,
+    videoCodec: string | undefined,
+    trackDuration: number
+  ) {
+    super.resetInitSegment(initSegment, audioCodec, videoCodec, trackDuration);
     this._audioTrack = {
       container: 'audio/mpeg',
       type: 'audio',
@@ -20,7 +25,7 @@ class MP3Demuxer extends BaseAudioDemuxer {
       isAAC: false,
       samples: [],
       manifestCodec: audioCodec,
-      duration: duration,
+      duration: trackDuration,
       inputTimeScale: 90000,
       dropped: 0,
     };

diff --git a/src/demux/mp4demuxer.ts b/src/demux/mp4demuxer.ts
@@ -4,7 +4,7 @@
 import {
   Demuxer,
   DemuxerResult,
-  PassthroughVideoTrack,
+  PassthroughTrack,
   DemuxedAudioTrack,
   DemuxedUserdataTrack,
   DemuxedMetadataTrack,
@@ -15,6 +15,9 @@ import {
   segmentValidRange,
   appendUint8Array,
   parseEmsg,
+  parseSamples,
+  parseInitSegment,
+  RemuxerTrackIdConfig,
 } from '../utils/mp4-tools';
 import { dummyTrack } from './dummy-demuxed-track';
 import type { HlsEventEmitter } from '../events';
@@ -25,87 +28,147 @@ const emsgSchemePattern = /\/emsg[-/]ID3/i;
 class MP4Demuxer implements Demuxer {
   static readonly minProbeByteLength = 1024;
   private remainderData: Uint8Array | null = null;
+  private timeOffset: number = 0;
   private config: HlsConfig;
+  private videoTrack?: PassthroughTrack;
+  private audioTrack?: DemuxedAudioTrack;
+  private id3Track?: DemuxedMetadataTrack;
+  private txtTrack?: DemuxedUserdataTrack;
 
   constructor(observer: HlsEventEmitter, config: HlsConfig) {
     this.config = config;
   }
 
-  resetTimeStamp() {}
+  public resetTimeStamp() {}
 
-  resetInitSegment() {}
+  public resetInitSegment(
+    initSegment: Uint8Array,
+    audioCodec: string | undefined,
+    videoCodec: string | undefined,
+    trackDuration: number
+  ) {
+    const initData = parseInitSegment(initSegment);
+    const videoTrack = (this.videoTrack = dummyTrack(
+      'video',
+      1
+    ) as PassthroughTrack);
+    const audioTrack = (this.audioTrack = dummyTrack(
+      'audio',
+      1
+    ) as DemuxedAudioTrack);
+    const captionTrack = (this.txtTrack = dummyTrack(
+      'text',
+      1
+    ) as DemuxedUserdataTrack);
 
-  resetContiguity(): void {}
+    this.id3Track = dummyTrack('id3', 1) as DemuxedMetadataTrack;
+    this.timeOffset = 0;
 
-  static probe(data) {
+    if (initData.video) {
+      const { id, timescale, codec } = initData.video;
+      videoTrack.id = id;
+      videoTrack.timescale = captionTrack.timescale = timescale;
+      videoTrack.codec = codec;
+    }
+
+    if (initData.audio) {
+      const { id, timescale, codec } = initData.audio;
+      audioTrack.id = id;
+      audioTrack.timescale = timescale;
+      audioTrack.codec = codec;
+    }
+
+    captionTrack.id = RemuxerTrackIdConfig.text;
+    videoTrack.sampleDuration = 0;
+    videoTrack.duration = audioTrack.duration = trackDuration;
+  }
+
+  public resetContiguity(): void {}
+
+  static probe(data: Uint8Array) {
     // ensure we find a moof box in the first 16 kB
-    return (
-      findBox({ data: data, start: 0, end: Math.min(data.length, 16384) }, [
-        'moof',
-      ]).length > 0
-    );
+    data = data.length > 16384 ? data.subarray(0, 16384) : data;
+    return findBox(data, ['moof']).length > 0;
   }
 
-  demux(data: Uint8Array, timeOffset: number): DemuxerResult {
+  public demux(data: Uint8Array, timeOffset: number): DemuxerResult {
+    this.timeOffset = timeOffset;
     // Load all data into the avc track. The CMAF remuxer will look for the data in the samples object; the rest of the fields do not matter
-    let avcSamples = data;
-    const avcTrack = dummyTrack() as PassthroughVideoTrack;
+    let videoSamples = data;
+    const videoTrack = this.videoTrack as PassthroughTrack;
+    const textTrack = this.txtTrack as DemuxedUserdataTrack;
     if (this.config.progressive) {
       // Split the bytestream into two ranges: one encompassing all data up until the start of the last moof, and everything else.
       // This is done to guarantee that we're sending valid data to MSE - when demuxing progressively, we have no guarantee
       // that the fetch loader gives us flush moof+mdat pairs. If we push jagged data to MSE, it will throw an exception.
       if (this.remainderData) {
-        avcSamples = appendUint8Array(this.remainderData, data);
+        videoSamples = appendUint8Array(this.remainderData, data);
       }
-      const segmentedData = segmentValidRange(avcSamples);
+      const segmentedData = segmentValidRange(videoSamples);
       this.remainderData = segmentedData.remainder;
-      avcTrack.samples = segmentedData.valid || new Uint8Array();
+      videoTrack.samples = segmentedData.valid || new Uint8Array();
     } else {
-      avcTrack.samples = avcSamples;
+      videoTrack.samples = videoSamples;
     }
 
-    const id3Track = dummyTrack() as DemuxedMetadataTrack;
-    const emsgs = findBox(avcTrack.samples, ['emsg']);
-    if (emsgs) {
-      id3Track.inputTimeScale = 1;
-      emsgs.forEach(({ data, start, end }) => {
-        const emsgInfo = parseEmsg(data.subarray(start, end));
-        if (emsgSchemePattern.test(emsgInfo.schemeIdUri)) {
-          const pts = Number.isFinite(emsgInfo.presentationTime)
-            ? emsgInfo.presentationTime! / emsgInfo.timeScale
-            : timeOffset + emsgInfo.presentationTimeDelta! / emsgInfo.timeScale;
-          const payload = emsgInfo.payload;
-          id3Track.samples.push({
-            data: payload,
-            len: payload.byteLength,
-            dts: pts,
-            pts: pts,
-          });
-        }
-      });
-    }
+    const id3Track = this.extractID3Track(videoTrack, timeOffset);
+    textTrack.samples = parseSamples(timeOffset, videoTrack);
 
     return {
-      audioTrack: dummyTrack() as DemuxedAudioTrack,
-      avcTrack,
+      videoTrack,
+      audioTrack: this.audioTrack as DemuxedAudioTrack,
       id3Track,
-      textTrack: dummyTrack() as DemuxedUserdataTrack,
+      textTrack: this.txtTrack as DemuxedUserdataTrack,
     };
   }
 
-  flush() {
-    const avcTrack = dummyTrack() as PassthroughVideoTrack;
-    avcTrack.samples = this.remainderData || new Uint8Array();
+  public flush() {
+    const timeOffset = this.timeOffset;
+    const videoTrack = this.videoTrack as PassthroughTrack;
+    const textTrack = this.txtTrack as DemuxedUserdataTrack;
+    videoTrack.samples = this.remainderData || new Uint8Array();
     this.remainderData = null;
 
+    const id3Track = this.extractID3Track(videoTrack, this.timeOffset);
+    textTrack.samples = parseSamples(timeOffset, videoTrack);
+
     return {
+      videoTrack,
       audioTrack: dummyTrack() as DemuxedAudioTrack,
-      avcTrack,
-      id3Track: dummyTrack() as DemuxedMetadataTrack,
+      id3Track,
       textTrack: dummyTrack() as DemuxedUserdataTrack,
     };
   }
 
+  private extractID3Track(
+    videoTrack: PassthroughTrack,
+    timeOffset: number
+  ): DemuxedMetadataTrack {
+    const id3Track = this.id3Track as DemuxedMetadataTrack;
+    if (videoTrack.samples.length) {
+      const emsgs = findBox(videoTrack.samples, ['emsg']);
+      if (emsgs) {
+        emsgs.forEach((data: Uint8Array) => {
+          const emsgInfo = parseEmsg(data);
+          if (emsgSchemePattern.test(emsgInfo.schemeIdUri)) {
+            const pts = Number.isFinite(emsgInfo.presentationTime)
+              ? emsgInfo.presentationTime! / emsgInfo.timeScale
+              : timeOffset +
+                emsgInfo.presentationTimeDelta! / emsgInfo.timeScale;
+            const payload = emsgInfo.payload;
+            id3Track.samples.push({
+              data: payload,
+              len: payload.byteLength,
+              dts: pts,
+              pts: pts,
+            });
+          }
+        });
+      }
+    }
+    return id3Track;
+  }
+
   demuxSampleAes(
     data: Uint8Array,
     keyData: KeyData,