From f6a4973578c4692f33283aee1c089afd0cdae508 Mon Sep 17 00:00:00 2001
From: Zaggy1024 <Zaggy1024@gmail.com>
Date: Wed, 19 Jun 2024 18:29:12 -0500
Subject: [PATCH] LibMedia: Give frame timestamps to FFmpeg decoders

H.264 in Matroska can have blocks with unordered timestamps. Without
passing these as the presentation timestamp into the FFmpeg decoder,
the frames will not be returned in chronological order.

VideoFrame will now include a timestamp that is used by the
PlaybackManager, rather than assuming that it is the same timestamp
returned by the demuxer.
---
 Meta/Lagom/Fuzzers/FuzzVP9Decoder.cpp             |  2 +-
 Tests/LibMedia/TestMediaCommon.h                  |  2 +-
 Tests/LibMedia/TestVP9Decode.cpp                  |  2 +-
 .../LibMedia/FFmpeg/FFmpegVideoDecoder.cpp        |  7 +++++--
 .../LibMedia/FFmpeg/FFmpegVideoDecoder.h          |  2 +-
 Userland/Libraries/LibMedia/PlaybackManager.cpp   |  6 +++---
 Userland/Libraries/LibMedia/Video/VP9/Decoder.cpp | 15 ++++++++-------
 Userland/Libraries/LibMedia/Video/VP9/Decoder.h   |  6 +++---
 Userland/Libraries/LibMedia/VideoDecoder.h        |  5 +++--
 Userland/Libraries/LibMedia/VideoFrame.cpp        |  6 ++++--
 Userland/Libraries/LibMedia/VideoFrame.h          | 15 ++++++++++++---
 11 files changed, 42 insertions(+), 26 deletions(-)
diff --git a/Meta/Lagom/Fuzzers/FuzzVP9Decoder.cpp b/Meta/Lagom/Fuzzers/FuzzVP9Decoder.cpp
index 69fe35acdb1..b2e0512e900 100644
--- a/Meta/Lagom/Fuzzers/FuzzVP9Decoder.cpp
+++ b/Meta/Lagom/Fuzzers/FuzzVP9Decoder.cpp
@@ -11,6 +11,6 @@ extern "C" int LLVMFuzzerTestOneInput(u8 const* data, size_t size)
 {
     AK::set_debug_enabled(false);
     Media::Video::VP9::Decoder vp9_decoder;
-    (void)vp9_decoder.receive_sample({ data, size });
+    (void)vp9_decoder.receive_sample(Duration::zero(), { data, size });
     return 0;
 }
diff --git a/Tests/LibMedia/TestMediaCommon.h b/Tests/LibMedia/TestMediaCommon.h
index 0b57e7388fd..92be4649fcb 100644
--- a/Tests/LibMedia/TestMediaCommon.h
+++ b/Tests/LibMedia/TestMediaCommon.h
@@ -37,7 +37,7 @@ static inline void decode_video(StringView path, size_t expected_frame_count, T
 
         auto block = block_result.release_value();
         for (auto const& frame : block.frames()) {
-            MUST(decoder->receive_sample(frame));
+            MUST(decoder->receive_sample(block.timestamp(), frame));
             while (true) {
                 auto frame_result = decoder->get_decoded_frame();
                 if (frame_result.is_error()) {
diff --git a/Tests/LibMedia/TestVP9Decode.cpp b/Tests/LibMedia/TestVP9Decode.cpp
index aaeafa8693c..c0bbd316ce4 100644
--- a/Tests/LibMedia/TestVP9Decode.cpp
+++ b/Tests/LibMedia/TestVP9Decode.cpp
@@ -35,7 +35,7 @@ TEST_CASE(vp9_malformed_frame)
     for (auto test_input : test_inputs) {
         auto file = MUST(Core::MappedFile::map(test_input));
         Media::Video::VP9::Decoder vp9_decoder;
-        auto maybe_decoder_error = vp9_decoder.receive_sample(file->bytes());
+        auto maybe_decoder_error = vp9_decoder.receive_sample(Duration::zero(), file->bytes());
         EXPECT(maybe_decoder_error.is_error());
     }
 }
diff --git a/Userland/Libraries/LibMedia/FFmpeg/FFmpegVideoDecoder.cpp b/Userland/Libraries/LibMedia/FFmpeg/FFmpegVideoDecoder.cpp
index f3f5af64c5a..cc9020f74d1 100644
--- a/Userland/Libraries/LibMedia/FFmpeg/FFmpegVideoDecoder.cpp
+++ b/Userland/Libraries/LibMedia/FFmpeg/FFmpegVideoDecoder.cpp
@@ -101,12 +101,14 @@ FFmpegVideoDecoder::~FFmpegVideoDecoder()
     avcodec_free_context(&m_codec_context);
 }
 
-DecoderErrorOr<void> FFmpegVideoDecoder::receive_sample(ReadonlyBytes sample)
+DecoderErrorOr<void> FFmpegVideoDecoder::receive_sample(Duration timestamp, ReadonlyBytes sample)
 {
     VERIFY(sample.size() < NumericLimits<int>::max());
 
     m_packet->data = const_cast<u8*>(sample.data());
     m_packet->size = static_cast<int>(sample.size());
+    m_packet->pts = timestamp.to_microseconds();
+    m_packet->dts = m_packet->pts;
 
     auto result = avcodec_send_packet(m_codec_context, m_packet);
     switch (result) {
@@ -187,7 +189,8 @@ DecoderErrorOr<NonnullOwnPtr<VideoFrame>> FFmpegVideoDecoder::get_decoded_frame(
 
         auto size = Gfx::Size<u32> { m_frame->width, m_frame->height };
 
-        auto frame = DECODER_TRY_ALLOC(SubsampledYUVFrame::try_create(size, bit_depth, cicp, subsampling));
+        auto timestamp = Duration::from_microseconds(m_frame->pts);
+        auto frame = DECODER_TRY_ALLOC(SubsampledYUVFrame::try_create(timestamp, size, bit_depth, cicp, subsampling));
 
         for (u32 plane = 0; plane < 3; plane++) {
             VERIFY(m_frame->linesize[plane] != 0);
diff --git a/Userland/Libraries/LibMedia/FFmpeg/FFmpegVideoDecoder.h b/Userland/Libraries/LibMedia/FFmpeg/FFmpegVideoDecoder.h
index 3ec165ba36a..7146fbcc683 100644
--- a/Userland/Libraries/LibMedia/FFmpeg/FFmpegVideoDecoder.h
+++ b/Userland/Libraries/LibMedia/FFmpeg/FFmpegVideoDecoder.h
@@ -19,7 +19,7 @@ public:
     FFmpegVideoDecoder(AVCodecContext* codec_context, AVPacket* packet, AVFrame* frame);
     ~FFmpegVideoDecoder();
 
-    DecoderErrorOr<void> receive_sample(ReadonlyBytes sample) override;
+    DecoderErrorOr<void> receive_sample(Duration timestamp, ReadonlyBytes sample) override;
     DecoderErrorOr<NonnullOwnPtr<VideoFrame>> get_decoded_frame() override;
 
 private:
diff --git a/Userland/Libraries/LibMedia/PlaybackManager.cpp b/Userland/Libraries/LibMedia/PlaybackManager.cpp
index 579ddb416e8..cfb2996cdb8 100644
--- a/Userland/Libraries/LibMedia/PlaybackManager.cpp
+++ b/Userland/Libraries/LibMedia/PlaybackManager.cpp
@@ -215,7 +215,7 @@ void PlaybackManager::decode_and_queue_one_sample()
         auto sample = sample_result.release_value();
 
         // Submit the sample to the decoder.
-        auto decode_result = m_decoder->receive_sample(sample.data());
+        auto decode_result = m_decoder->receive_sample(sample.timestamp(), sample.data());
         if (decode_result.is_error()) {
             item_to_enqueue = FrameQueueItem::error_marker(decode_result.release_error(), sample.timestamp());
             break;
@@ -262,9 +262,9 @@ void PlaybackManager::decode_and_queue_one_sample()
             auto bitmap_result = decoded_frame->to_bitmap();
 
             if (bitmap_result.is_error())
-                item_to_enqueue = FrameQueueItem::error_marker(bitmap_result.release_error(), sample.timestamp());
+                item_to_enqueue = FrameQueueItem::error_marker(bitmap_result.release_error(), decoded_frame->timestamp());
             else
-                item_to_enqueue = FrameQueueItem::frame(bitmap_result.release_value(), sample.timestamp());
+                item_to_enqueue = FrameQueueItem::frame(bitmap_result.release_value(), decoded_frame->timestamp());
             break;
         }
     }
diff --git a/Userland/Libraries/LibMedia/Video/VP9/Decoder.cpp b/Userland/Libraries/LibMedia/Video/VP9/Decoder.cpp
index 5ecb233058a..a631d033aef 100644
--- a/Userland/Libraries/LibMedia/Video/VP9/Decoder.cpp
+++ b/Userland/Libraries/LibMedia/Video/VP9/Decoder.cpp
@@ -25,12 +25,12 @@ Decoder::Decoder()
 {
 }
 
-DecoderErrorOr<void> Decoder::receive_sample(ReadonlyBytes chunk_data)
+DecoderErrorOr<void> Decoder::receive_sample(Duration timestamp, ReadonlyBytes chunk_data)
 {
     auto superframe_sizes = m_parser->parse_superframe_sizes(chunk_data);
 
     if (superframe_sizes.is_empty()) {
-        return decode_frame(chunk_data);
+        return decode_frame(timestamp, chunk_data);
     }
 
     size_t offset = 0;
@@ -41,14 +41,14 @@ DecoderErrorOr<void> Decoder::receive_sample(ReadonlyBytes chunk_data)
         if (checked_size.has_overflow() || checked_size.value() > chunk_data.size())
             return DecoderError::with_description(DecoderErrorCategory::Corrupted, "Superframe size invalid"sv);
         auto frame_data = chunk_data.slice(offset, superframe_size);
-        TRY(decode_frame(frame_data));
+        TRY(decode_frame(timestamp, frame_data));
         offset = checked_size.value();
     }
 
     return {};
 }
 
-DecoderErrorOr<void> Decoder::decode_frame(ReadonlyBytes frame_data)
+DecoderErrorOr<void> Decoder::decode_frame(Duration timestamp, ReadonlyBytes frame_data)
 {
     // 1. The syntax elements for the coded frame are extracted as specified in sections 6 and 7. The syntax
     // tables include function calls indicating when the block decode processes should be triggered.
@@ -69,11 +69,11 @@ DecoderErrorOr<void> Decoder::decode_frame(ReadonlyBytes frame_data)
     if (frame_context.shows_a_frame()) {
         switch (frame_context.color_config.bit_depth) {
         case 8:
-            TRY(create_video_frame<u8>(frame_context));
+            TRY(create_video_frame<u8>(timestamp, frame_context));
             break;
         case 10:
         case 12:
-            TRY(create_video_frame<u16>(frame_context));
+            TRY(create_video_frame<u16>(timestamp, frame_context));
             break;
         }
     }
@@ -143,7 +143,7 @@ inline CodingIndependentCodePoints get_cicp_color_space(FrameContext const& fram
 }
 
 template<typename T>
-DecoderErrorOr<void> Decoder::create_video_frame(FrameContext const& frame_context)
+DecoderErrorOr<void> Decoder::create_video_frame(Duration timestamp, FrameContext const& frame_context)
 {
     // (8.9) Output process
 
@@ -162,6 +162,7 @@ DecoderErrorOr<void> Decoder::create_video_frame(FrameContext const& frame_conte
     auto output_uv_size = subsampling.subsampled_size(output_y_size);
 
     auto frame = DECODER_TRY_ALLOC(SubsampledYUVFrame::try_create(
+        timestamp,
         { output_y_size.width(), output_y_size.height() },
         frame_context.color_config.bit_depth, get_cicp_color_space(frame_context),
         subsampling));
diff --git a/Userland/Libraries/LibMedia/Video/VP9/Decoder.h b/Userland/Libraries/LibMedia/Video/VP9/Decoder.h
index 40927affb84..33dea7a131f 100644
--- a/Userland/Libraries/LibMedia/Video/VP9/Decoder.h
+++ b/Userland/Libraries/LibMedia/Video/VP9/Decoder.h
@@ -28,7 +28,7 @@ public:
     Decoder();
     ~Decoder() override { }
     /* (8.1) General */
-    DecoderErrorOr<void> receive_sample(ReadonlyBytes) override;
+    DecoderErrorOr<void> receive_sample(Duration timestamp, ReadonlyBytes) override;
 
     DecoderErrorOr<NonnullOwnPtr<VideoFrame>> get_decoded_frame() override;
 
@@ -41,9 +41,9 @@ private:
     // Based on the maximum for TXSize.
     static constexpr size_t maximum_transform_size = 32ULL * 32ULL;
 
-    DecoderErrorOr<void> decode_frame(ReadonlyBytes);
+    DecoderErrorOr<void> decode_frame(Duration timestamp, ReadonlyBytes);
     template<typename T>
-    DecoderErrorOr<void> create_video_frame(FrameContext const&);
+    DecoderErrorOr<void> create_video_frame(Duration timestamp, FrameContext const&);
 
     DecoderErrorOr<void> allocate_buffers(FrameContext const&);
     Vector<u16>& get_output_buffer(u8 plane);
diff --git a/Userland/Libraries/LibMedia/VideoDecoder.h b/Userland/Libraries/LibMedia/VideoDecoder.h
index 76659d7c400..0a219ecba19 100644
--- a/Userland/Libraries/LibMedia/VideoDecoder.h
+++ b/Userland/Libraries/LibMedia/VideoDecoder.h
@@ -8,6 +8,7 @@
 
 #include <AK/ByteBuffer.h>
 #include <AK/NonnullOwnPtr.h>
+#include <AK/Time.h>
 
 #include "DecoderError.h"
 
@@ -17,8 +18,8 @@ class VideoDecoder {
 public:
     virtual ~VideoDecoder() {};
 
-    virtual DecoderErrorOr<void> receive_sample(ReadonlyBytes sample) = 0;
-    DecoderErrorOr<void> receive_sample(ByteBuffer const& sample) { return receive_sample(sample.span()); }
+    virtual DecoderErrorOr<void> receive_sample(Duration timestamp, ReadonlyBytes sample) = 0;
+    DecoderErrorOr<void> receive_sample(Duration timestamp, ByteBuffer const& sample) { return receive_sample(timestamp, sample.span()); }
     virtual DecoderErrorOr<NonnullOwnPtr<VideoFrame>> get_decoded_frame() = 0;
 };
 
diff --git a/Userland/Libraries/LibMedia/VideoFrame.cpp b/Userland/Libraries/LibMedia/VideoFrame.cpp
index fd101d70005..a7d39b80c42 100644
--- a/Userland/Libraries/LibMedia/VideoFrame.cpp
+++ b/Userland/Libraries/LibMedia/VideoFrame.cpp
@@ -13,6 +13,7 @@
 namespace Media {
 
 ErrorOr<NonnullOwnPtr<SubsampledYUVFrame>> SubsampledYUVFrame::try_create(
+    Duration timestamp,
     Gfx::Size<u32> size,
     u8 bit_depth, CodingIndependentCodePoints cicp,
     Subsampling subsampling)
@@ -35,16 +36,17 @@ ErrorOr<NonnullOwnPtr<SubsampledYUVFrame>> SubsampledYUVFrame::try_create(
     auto* u_buffer = TRY(alloc_buffer(uv_data_size));
     auto* v_buffer = TRY(alloc_buffer(uv_data_size));
 
-    return adopt_nonnull_own_or_enomem(new (nothrow) SubsampledYUVFrame(size, bit_depth, cicp, subsampling, y_buffer, u_buffer, v_buffer));
+    return adopt_nonnull_own_or_enomem(new (nothrow) SubsampledYUVFrame(timestamp, size, bit_depth, cicp, subsampling, y_buffer, u_buffer, v_buffer));
 }
 
 ErrorOr<NonnullOwnPtr<SubsampledYUVFrame>> SubsampledYUVFrame::try_create_from_data(
+    Duration timestamp,
     Gfx::Size<u32> size,
     u8 bit_depth, CodingIndependentCodePoints cicp,
     Subsampling subsampling,
     ReadonlyBytes y_data, ReadonlyBytes u_data, ReadonlyBytes v_data)
 {
-    auto frame = TRY(try_create(size, bit_depth, cicp, subsampling));
+    auto frame = TRY(try_create(timestamp, size, bit_depth, cicp, subsampling));
 
     size_t component_size = bit_depth > 8 ? sizeof(u16) : sizeof(u8);
     auto y_data_size = size.to_type<size_t>().area() * component_size;
diff --git a/Userland/Libraries/LibMedia/VideoFrame.h b/Userland/Libraries/LibMedia/VideoFrame.h
index 4719b420792..0dd015b4431 100644
--- a/Userland/Libraries/LibMedia/VideoFrame.h
+++ b/Userland/Libraries/LibMedia/VideoFrame.h
@@ -8,6 +8,7 @@
 
 #include <AK/ByteBuffer.h>
 #include <AK/FixedArray.h>
+#include <AK/Time.h>
 #include <LibGfx/Bitmap.h>
 #include <LibGfx/Size.h>
 #include <LibMedia/Color/CodingIndependentCodePoints.h>
@@ -30,6 +31,8 @@ public:
         return bitmap;
     }
 
+    inline Duration timestamp() const { return m_timestamp; }
+
     inline Gfx::Size<u32> size() const { return m_size; }
     inline u32 width() const { return size().width(); }
     inline u32 height() const { return size().height(); }
@@ -38,14 +41,17 @@ public:
     inline CodingIndependentCodePoints& cicp() { return m_cicp; }
 
 protected:
-    VideoFrame(Gfx::Size<u32> size,
+    VideoFrame(Duration timestamp,
+        Gfx::Size<u32> size,
         u8 bit_depth, CodingIndependentCodePoints cicp)
-        : m_size(size)
+        : m_timestamp(timestamp)
+        , m_size(size)
         , m_bit_depth(bit_depth)
         , m_cicp(cicp)
     {
     }
 
+    Duration m_timestamp;
     Gfx::Size<u32> m_size;
     u8 m_bit_depth;
     CodingIndependentCodePoints m_cicp;
@@ -55,22 +61,25 @@ class SubsampledYUVFrame : public VideoFrame {
 
 public:
     static ErrorOr<NonnullOwnPtr<SubsampledYUVFrame>> try_create(
+        Duration timestamp,
         Gfx::Size<u32> size,
         u8 bit_depth, CodingIndependentCodePoints cicp,
         Subsampling subsampling);
 
     static ErrorOr<NonnullOwnPtr<SubsampledYUVFrame>> try_create_from_data(
+        Duration timestamp,
         Gfx::Size<u32> size,
         u8 bit_depth, CodingIndependentCodePoints cicp,
         Subsampling subsampling,
         ReadonlyBytes y_data, ReadonlyBytes u_data, ReadonlyBytes v_data);
 
     SubsampledYUVFrame(
+        Duration timestamp,
         Gfx::Size<u32> size,
         u8 bit_depth, CodingIndependentCodePoints cicp,
         Subsampling subsampling,
         u8* plane_y_data, u8* plane_u_data, u8* plane_v_data)
-        : VideoFrame(size, bit_depth, cicp)
+        : VideoFrame(timestamp, size, bit_depth, cicp)
         , m_subsampling(subsampling)
         , m_y_buffer(plane_y_data)
         , m_u_buffer(plane_u_data)