scuffle_transmuxer/
lib.rs

1#![allow(clippy::single_match)]
2
3use std::borrow::Cow;
4use std::collections::{HashMap, VecDeque};
5use std::fmt::Debug;
6use std::io;
7
8use byteorder::{BigEndian, ReadBytesExt};
9use bytes::{Buf, Bytes};
10use scuffle_amf0::Amf0Value;
11use scuffle_flv::aac::AacPacket;
12use scuffle_flv::audio::{AudioData, AudioDataBody, SoundType};
13use scuffle_flv::av1::Av1Packet;
14use scuffle_flv::avc::AvcPacket;
15use scuffle_flv::hevc::HevcPacket;
16use scuffle_flv::script::ScriptData;
17use scuffle_flv::tag::{FlvTag, FlvTagData};
18use scuffle_flv::video::{EnhancedPacket, FrameType, VideoTagBody, VideoTagHeader};
19use scuffle_mp4::codec::{AudioCodec, VideoCodec};
20use scuffle_mp4::types::ftyp::{FourCC, Ftyp};
21use scuffle_mp4::types::hdlr::{HandlerType, Hdlr};
22use scuffle_mp4::types::mdat::Mdat;
23use scuffle_mp4::types::mdhd::Mdhd;
24use scuffle_mp4::types::mdia::Mdia;
25use scuffle_mp4::types::mfhd::Mfhd;
26use scuffle_mp4::types::minf::Minf;
27use scuffle_mp4::types::moof::Moof;
28use scuffle_mp4::types::moov::Moov;
29use scuffle_mp4::types::mvex::Mvex;
30use scuffle_mp4::types::mvhd::Mvhd;
31use scuffle_mp4::types::smhd::Smhd;
32use scuffle_mp4::types::stbl::Stbl;
33use scuffle_mp4::types::stco::Stco;
34use scuffle_mp4::types::stsc::Stsc;
35use scuffle_mp4::types::stsd::Stsd;
36use scuffle_mp4::types::stsz::Stsz;
37use scuffle_mp4::types::stts::Stts;
38use scuffle_mp4::types::tfdt::Tfdt;
39use scuffle_mp4::types::tfhd::Tfhd;
40use scuffle_mp4::types::tkhd::Tkhd;
41use scuffle_mp4::types::traf::Traf;
42use scuffle_mp4::types::trak::Trak;
43use scuffle_mp4::types::trex::Trex;
44use scuffle_mp4::types::trun::Trun;
45use scuffle_mp4::types::vmhd::Vmhd;
46use scuffle_mp4::BoxType;
47
48mod codecs;
49mod define;
50mod errors;
51
52pub use define::*;
53pub use errors::TransmuxError;
54
/// Intermediate collection of the tags required to build the init segment:
/// the video and audio sequence headers plus the optional `onMetaData`
/// script-data object. Populated by [`Transmuxer::find_tags`].
struct Tags {
    /// Video decoder configuration (AVC/AV1/HEVC sequence header), if seen.
    video_sequence_header: Option<VideoSequenceHeader>,
    /// Audio decoder configuration (AAC sequence header), if seen.
    audio_sequence_header: Option<AudioSequenceHeader>,
    /// Key/value pairs of the AMF0 `onMetaData` object, if a script tag was seen.
    scriptdata_tag: Option<HashMap<Cow<'static, str>, Amf0Value<'static>>>,
}
60
/// Transmuxes demuxed FLV tags into fragmented MP4 (fMP4) init and media
/// segments.
#[derive(Debug, Clone)]
pub struct Transmuxer {
    // These durations are measured in timescales
    /// Total audio duration so far, in audio timescale units
    /// (the audio timescale is the sample rate).
    audio_duration: u64,
    /// Total video duration so far, in video timescale units
    /// (the video timescale is fps * 1000).
    video_duration: u64,
    /// Sequence number written into each moof's mfhd box; starts at 1.
    sequence_number: u32,
    /// Timestamp (ms) of the last video tag that produced a media segment.
    last_video_timestamp: u32,
    /// Stream settings, populated once the init segment has been produced.
    settings: Option<(VideoSettings, AudioSettings)>,
    /// FLV tags buffered until they can be muxed (FIFO).
    tags: VecDeque<FlvTag>,
}
73
74impl Default for Transmuxer {
75    fn default() -> Self {
76        Self::new()
77    }
78}
79
80impl Transmuxer {
81    pub fn new() -> Self {
82        Self {
83            sequence_number: 1,
84            tags: VecDeque::new(),
85            audio_duration: 0,
86            video_duration: 0,
87            last_video_timestamp: 0,
88            settings: None,
89        }
90    }
91
92    /// Feed raw FLV data to the transmuxer.
93    pub fn demux(&mut self, data: Bytes) -> Result<(), TransmuxError> {
94        let mut cursor = io::Cursor::new(data);
95        while cursor.has_remaining() {
96            cursor.read_u32::<BigEndian>()?; // previous tag size
97            if !cursor.has_remaining() {
98                break;
99            }
100
101            let tag = FlvTag::demux(&mut cursor)?;
102            self.tags.push_back(tag);
103        }
104
105        Ok(())
106    }
107
108    /// Feed a single FLV tag to the transmuxer.
109    pub fn add_tag(&mut self, tag: FlvTag) {
110        self.tags.push_back(tag);
111    }
112
    /// Get the next transmuxed packet. This will return `None` if there is not
    /// enough data to create a packet.
    ///
    /// The first successful call produces the fMP4 init segment (`ftyp` +
    /// `moov`); every subsequent call produces one single-sample media segment
    /// (`moof` + `mdat`) for either the audio or the video track.
    pub fn mux(&mut self) -> Result<Option<TransmuxResult>, TransmuxError> {
        let mut writer = Vec::new();

        // Until we have stream settings, we must first emit an init segment.
        let Some((video_settings, _)) = &self.settings else {
            let Some((video_settings, audio_settings)) = self.init_sequence(&mut writer)? else {
                if self.tags.len() > 30 {
                    // We are clearly not getting any sequence headers, so we should just give up
                    return Err(TransmuxError::NoSequenceHeaders);
                }

                // We don't have enough tags to create an init segment yet
                return Ok(None);
            };

            self.settings = Some((video_settings.clone(), audio_settings.clone()));

            return Ok(Some(TransmuxResult::InitSegment {
                data: Bytes::from(writer),
                audio_settings,
                video_settings,
            }));
        };

        // Pop buffered tags until one yields a media segment; tags we don't
        // support are silently skipped.
        loop {
            let Some(tag) = self.tags.pop_front() else {
                return Ok(None);
            };

            let mdat_data;
            let total_duration;
            let trun_sample;
            let mut is_audio = false;
            let mut is_keyframe = false;

            // Video frame duration in video timescale units (1000 * fps).
            // Only used by the video arms below; audio gets its duration from
            // the AAC codec helper instead.
            let duration =
                if self.last_video_timestamp == 0 || tag.timestamp_ms == 0 || tag.timestamp_ms < self.last_video_timestamp {
                    1000 // the first frame is always 1000 ticks where the
                         // timescale is 1000 * fps.
                } else {
                    // Since the delta is in milliseconds (ie 1/1000 of a second)
                    // Rounding errors happen. Our precision is only 1/1000 of a second.
                    // So if we have a 30fps video the delta should be 33.33ms (1000/30)
                    // But we can only represent this as 33ms or 34ms. So we will get rounding
                    // errors. To fix this we just check if the delta is 1 more or 1 less than the
                    // expected delta. And if it is we just use the expected delta.
                    // The reason we use a timescale which is 1000 * fps is because then we can
                    // always represent the delta as an integer. If we use a timescale of 1000, we
                    // would run into the same rounding errors.
                    let delta = tag.timestamp_ms as f64 - self.last_video_timestamp as f64;
                    let expected_delta = 1000.0 / video_settings.framerate;
                    if (delta - expected_delta).abs() <= 1.0 {
                        1000
                    } else {
                        (delta * video_settings.framerate) as u32
                    }
                };

            match tag.data {
                // Raw AAC frame: sample and duration come from the codec
                // helper, not from the FLV timestamp.
                FlvTagData::Audio(AudioData {
                    body: AudioDataBody::Aac(AacPacket::Raw(data)),
                    ..
                }) => {
                    let (sample, duration) = codecs::aac::trun_sample(&data)?;

                    trun_sample = sample;
                    mdat_data = data;
                    total_duration = duration;
                    is_audio = true;
                }
                FlvTagData::Video(VideoTagHeader {
                    frame_type,
                    body: VideoTagBody::Avc(AvcPacket::Nalu { composition_time, data }),
                    ..
                }) => {
                    // Convert the composition time offset (ms) into video
                    // timescale units, snapped down to a whole frame (1000 ticks).
                    let composition_time = ((composition_time as f64 * video_settings.framerate) / 1000.0).floor() * 1000.0;

                    let sample = codecs::avc::trun_sample(frame_type, composition_time as u32, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == FrameType::Keyframe;
                }
                // AV1 carries no composition time offset.
                FlvTagData::Video(VideoTagHeader {
                    frame_type,
                    body: VideoTagBody::Enhanced(EnhancedPacket::Av1(Av1Packet::Raw(data))),
                    ..
                }) => {
                    let sample = codecs::av1::trun_sample(frame_type, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == FrameType::Keyframe;
                }
                FlvTagData::Video(VideoTagHeader {
                    frame_type,
                    body: VideoTagBody::Enhanced(EnhancedPacket::Hevc(HevcPacket::Nalu { composition_time, data })),
                    ..
                }) => {
                    // HEVC's composition time is optional; a missing value is
                    // treated as 0 before the same frame-snapping conversion.
                    let composition_time =
                        ((composition_time.unwrap_or_default() as f64 * video_settings.framerate) / 1000.0).floor() * 1000.0;

                    let sample = codecs::hevc::trun_sample(frame_type, composition_time as i32, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == FrameType::Keyframe;
                }
                _ => {
                    // We don't support anything else
                    continue;
                }
            }

            // Build the single traf for this segment: track 1 is video,
            // track 2 is audio, and tfdt carries the running duration as the
            // base media decode time.
            let trafs = {
                let (main_duration, main_id) = if is_audio {
                    (self.audio_duration, 2)
                } else {
                    (self.video_duration, 1)
                };

                let mut traf = Traf::new(
                    Tfhd::new(main_id, None, None, None, None, None),
                    Some(Trun::new(vec![trun_sample], None)),
                    Some(Tfdt::new(main_duration)),
                );
                traf.optimize();

                vec![traf]
            };

            let mut moof = Moof::new(Mfhd::new(self.sequence_number), trafs);

            // We need to get the moof size so that we can set the data offsets.
            let moof_size = moof.size();

            // We just created the moof with exactly one traf (the video OR the
            // audio track for this segment), so we can safely unwrap it and
            // set its data offset.
            let traf = moof.traf.get_mut(0).expect("we just created the moof with a traf");

            // Again we know that these exist because we just created it.
            let trun = traf.trun.as_mut().expect("we just created the video traf with a trun");

            // We now define the offsets.
            // The sample data starts right after the moof: moof size + 8 bytes
            // for the mdat box header.
            trun.data_offset = Some(moof_size as i32 + 8);

            // We then write the moof to the writer.
            moof.mux(&mut writer)?;

            // We create an mdat box and write it to the writer.
            Mdat::new(vec![mdat_data]).mux(&mut writer)?;

            // Increase our sequence number and duration.
            self.sequence_number += 1;

            if is_audio {
                self.audio_duration += total_duration as u64;
                return Ok(Some(TransmuxResult::MediaSegment(MediaSegment {
                    data: Bytes::from(writer),
                    ty: MediaType::Audio,
                    keyframe: false,
                    // Timestamp is the decode time at the START of this sample.
                    timestamp: self.audio_duration - total_duration as u64,
                })));
            } else {
                self.video_duration += total_duration as u64;
                self.last_video_timestamp = tag.timestamp_ms;
                return Ok(Some(TransmuxResult::MediaSegment(MediaSegment {
                    data: Bytes::from(writer),
                    ty: MediaType::Video,
                    keyframe: is_keyframe,
                    // Timestamp is the decode time at the START of this sample.
                    timestamp: self.video_duration - total_duration as u64,
                })));
            }
        }
    }
298
299    /// Internal function to find the tags we need to create the init segment.
300    fn find_tags(&self) -> Tags {
301        let tags = self.tags.iter();
302        let mut video_sequence_header = None;
303        let mut audio_sequence_header = None;
304        let mut scriptdata_tag = None;
305
306        for tag in tags {
307            if video_sequence_header.is_some() && audio_sequence_header.is_some() && scriptdata_tag.is_some() {
308                break;
309            }
310
311            match &tag.data {
312                FlvTagData::Video(VideoTagHeader {
313                    frame_type: _,
314                    body: VideoTagBody::Avc(AvcPacket::SequenceHeader(data)),
315                    ..
316                }) => {
317                    video_sequence_header = Some(VideoSequenceHeader::Avc(data.clone()));
318                }
319                FlvTagData::Video(VideoTagHeader {
320                    frame_type: _,
321                    body: VideoTagBody::Enhanced(EnhancedPacket::Av1(Av1Packet::SequenceStart(config))),
322                    ..
323                }) => {
324                    video_sequence_header = Some(VideoSequenceHeader::Av1(config.clone()));
325                }
326                FlvTagData::Video(VideoTagHeader {
327                    frame_type: _,
328                    body: VideoTagBody::Enhanced(EnhancedPacket::Hevc(HevcPacket::SequenceStart(config))),
329                    ..
330                }) => {
331                    video_sequence_header = Some(VideoSequenceHeader::Hevc(config.clone()));
332                }
333                FlvTagData::Audio(AudioData {
334                    body: AudioDataBody::Aac(AacPacket::SequenceHeader(data)),
335                    sound_size,
336                    sound_type,
337                    ..
338                }) => {
339                    audio_sequence_header = Some(AudioSequenceHeader {
340                        data: AudioSequenceHeaderData::Aac(data.clone()),
341                        sound_size: *sound_size,
342                        sound_type: *sound_type,
343                    });
344                }
345                FlvTagData::ScriptData(ScriptData { data, name }) => {
346                    if name == "@setDataFrame" || name == "onMetaData" {
347                        let meta_object = data.iter().find(|v| matches!(v, Amf0Value::Object(_)));
348
349                        if let Some(Amf0Value::Object(meta_object)) = meta_object {
350                            scriptdata_tag = Some(meta_object.iter().map(|(k, v)| (k.clone(), v.clone())).collect());
351                        }
352                    }
353                }
354                _ => {}
355            }
356        }
357
358        Tags {
359            video_sequence_header,
360            audio_sequence_header,
361            scriptdata_tag,
362        }
363    }
364
365    /// Create the init segment.
366    fn init_sequence(
367        &mut self,
368        writer: &mut impl io::Write,
369    ) -> Result<Option<(VideoSettings, AudioSettings)>, TransmuxError> {
370        // We need to find the tag that is the video sequence header
371        // and the audio sequence header
372        let Tags {
373            video_sequence_header,
374            audio_sequence_header,
375            scriptdata_tag,
376        } = self.find_tags();
377
378        let Some(video_sequence_header) = video_sequence_header else {
379            return Ok(None);
380        };
381        let Some(audio_sequence_header) = audio_sequence_header else {
382            return Ok(None);
383        };
384
385        let video_codec;
386        let audio_codec;
387        let video_width;
388        let video_height;
389        let audio_channels;
390        let audio_sample_rate;
391        let mut video_fps = 0.0;
392
393        let mut estimated_video_bitrate = 0;
394        let mut estimated_audio_bitrate = 0;
395
396        if let Some(scriptdata_tag) = scriptdata_tag {
397            video_fps = scriptdata_tag
398                .get("framerate")
399                .and_then(|v| match v {
400                    Amf0Value::Number(v) => Some(*v),
401                    _ => None,
402                })
403                .unwrap_or(0.0);
404
405            estimated_video_bitrate = scriptdata_tag
406                .get("videodatarate")
407                .and_then(|v| match v {
408                    Amf0Value::Number(v) => Some((*v * 1024.0) as u32),
409                    _ => None,
410                })
411                .unwrap_or(0);
412
413            estimated_audio_bitrate = scriptdata_tag
414                .get("audiodatarate")
415                .and_then(|v| match v {
416                    Amf0Value::Number(v) => Some((*v * 1024.0) as u32),
417                    _ => None,
418                })
419                .unwrap_or(0);
420        }
421
422        let mut compatiable_brands = vec![FourCC::Iso5, FourCC::Iso6];
423
424        let video_stsd_entry = match video_sequence_header {
425            VideoSequenceHeader::Avc(config) => {
426                compatiable_brands.push(FourCC::Avc1);
427                video_codec = VideoCodec::Avc {
428                    constraint_set: config.profile_compatibility,
429                    level: config.level_indication,
430                    profile: config.profile_indication,
431                };
432
433                let (entry, sps) = codecs::avc::stsd_entry(config)?;
434                if sps.frame_rate != 0.0 {
435                    video_fps = sps.frame_rate;
436                }
437
438                video_width = sps.width as u32;
439                video_height = sps.height as u32;
440
441                entry
442            }
443            VideoSequenceHeader::Av1(config) => {
444                compatiable_brands.push(FourCC::Av01);
445                let (entry, seq_obu) = codecs::av1::stsd_entry(config)?;
446
447                video_height = seq_obu.max_frame_height as u32;
448                video_width = seq_obu.max_frame_width as u32;
449
450                let op_point = &seq_obu.operating_points[0];
451
452                video_codec = VideoCodec::Av1 {
453                    profile: seq_obu.seq_profile,
454                    level: op_point.seq_level_idx,
455                    tier: op_point.seq_tier,
456                    depth: seq_obu.color_config.bit_depth as u8,
457                    monochrome: seq_obu.color_config.mono_chrome,
458                    sub_sampling_x: seq_obu.color_config.subsampling_x,
459                    sub_sampling_y: seq_obu.color_config.subsampling_y,
460                    color_primaries: seq_obu.color_config.color_primaries,
461                    transfer_characteristics: seq_obu.color_config.transfer_characteristics,
462                    matrix_coefficients: seq_obu.color_config.matrix_coefficients,
463                    full_range_flag: seq_obu.color_config.full_color_range,
464                };
465
466                entry
467            }
468            VideoSequenceHeader::Hevc(config) => {
469                compatiable_brands.push(FourCC::Hev1);
470                video_codec = VideoCodec::Hevc {
471                    constraint_indicator: config.general_constraint_indicator_flags,
472                    level: config.general_level_idc,
473                    profile: config.general_profile_idc,
474                    profile_compatibility: config.general_profile_compatibility_flags,
475                    tier: config.general_tier_flag,
476                    general_profile_space: config.general_profile_space,
477                };
478
479                let (entry, sps) = codecs::hevc::stsd_entry(config)?;
480                if sps.frame_rate != 0.0 {
481                    video_fps = sps.frame_rate;
482                }
483
484                video_width = sps.width as u32;
485                video_height = sps.height as u32;
486
487                entry
488            }
489        };
490
491        let audio_stsd_entry = match audio_sequence_header.data {
492            AudioSequenceHeaderData::Aac(data) => {
493                compatiable_brands.push(FourCC::Mp41);
494                let (entry, config) =
495                    codecs::aac::stsd_entry(audio_sequence_header.sound_size, audio_sequence_header.sound_type, data)?;
496
497                audio_sample_rate = config.sampling_frequency;
498
499                audio_codec = AudioCodec::Aac {
500                    object_type: config.audio_object_type,
501                };
502                audio_channels = match audio_sequence_header.sound_type {
503                    SoundType::Mono => 1,
504                    SoundType::Stereo => 2,
505                    _ => return Err(TransmuxError::InvalidAudioChannels),
506                };
507
508                entry
509            }
510        };
511
512        if video_fps == 0.0 {
513            return Err(TransmuxError::InvalidVideoFrameRate);
514        }
515
516        if video_width == 0 || video_height == 0 {
517            return Err(TransmuxError::InvalidVideoDimensions);
518        }
519
520        if audio_sample_rate == 0 {
521            return Err(TransmuxError::InvalidAudioSampleRate);
522        }
523
524        // The reason we multiply the FPS by 1000 is to avoid rounding errors
525        // Consider If we had a video with a framerate of 30fps. That would imply each
526        // frame is 33.333333ms So we are limited to a u32 and therefore we could only
527        // represent 33.333333ms as 33ms. So this value is 30 * 1000 = 30000 timescale
528        // units per second, making each frame 1000 units long instead of 33ms long.
529        let video_timescale = (1000.0 * video_fps) as u32;
530
531        Ftyp::new(FourCC::Iso5, 512, compatiable_brands).mux(writer)?;
532        Moov::new(
533            Mvhd::new(0, 0, 1000, 0, 1),
534            vec![
535                Trak::new(
536                    Tkhd::new(0, 0, 1, 0, Some((video_width, video_height))),
537                    None,
538                    Mdia::new(
539                        Mdhd::new(0, 0, video_timescale, 0),
540                        Hdlr::new(HandlerType::Vide, "VideoHandler".to_string()),
541                        Minf::new(
542                            Stbl::new(
543                                Stsd::new(vec![video_stsd_entry]),
544                                Stts::new(vec![]),
545                                Stsc::new(vec![]),
546                                Stco::new(vec![]),
547                                Some(Stsz::new(0, vec![])),
548                            ),
549                            Some(Vmhd::new()),
550                            None,
551                        ),
552                    ),
553                ),
554                Trak::new(
555                    Tkhd::new(0, 0, 2, 0, None),
556                    None,
557                    Mdia::new(
558                        Mdhd::new(0, 0, audio_sample_rate, 0),
559                        Hdlr::new(HandlerType::Soun, "SoundHandler".to_string()),
560                        Minf::new(
561                            Stbl::new(
562                                Stsd::new(vec![audio_stsd_entry]),
563                                Stts::new(vec![]),
564                                Stsc::new(vec![]),
565                                Stco::new(vec![]),
566                                Some(Stsz::new(0, vec![])),
567                            ),
568                            None,
569                            Some(Smhd::new()),
570                        ),
571                    ),
572                ),
573            ],
574            Some(Mvex::new(vec![Trex::new(1), Trex::new(2)], None)),
575        )
576        .mux(writer)?;
577
578        Ok(Some((
579            VideoSettings {
580                width: video_width,
581                height: video_height,
582                framerate: video_fps,
583                codec: video_codec,
584                bitrate: estimated_video_bitrate,
585                timescale: video_timescale,
586            },
587            AudioSettings {
588                codec: audio_codec,
589                sample_rate: audio_sample_rate,
590                channels: audio_channels,
591                bitrate: estimated_audio_bitrate,
592                timescale: audio_sample_rate,
593            },
594        )))
595    }
596}
597
598#[cfg(test)]
599mod tests;