Ensure we don't read invalid memory when seeking to pts before the first frame (pytorch#331)

ahmadsharif1 · web-flow · commit b979201eaaae · 2024-11-04T15:29:06.000-05:00
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -34,6 +34,10 @@ double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
   return ptsToSeconds(pts, timeBase.den);
 }
 
+int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {  // seconds -> integer pts
+  return static_cast<int64_t>(std::round(seconds * timeBase.den));  // rounds to nearest; only timeBase.den is read
+}
+
 struct AVInput {
   UniqueAVFormatContext formatContext;
   std::unique_ptr<AVIOBytesContext> ioBytesContext;
@@ -663,7 +667,7 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
   for (int streamIndex : activeStreamIndices_) {
     StreamInfo& streamInfo = streams_[streamIndex];
     // clang-format off: clang format clashes
-    streamInfo.discardFramesBeforePts = *maybeDesiredPts_ * streamInfo.timeBase.den;
+    streamInfo.discardFramesBeforePts = secondsToClosestPts(*maybeDesiredPts_, streamInfo.timeBase);
     // clang-format on
   }
 
@@ -686,16 +690,18 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
   }
   int firstActiveStreamIndex = *activeStreamIndices_.begin();
   const auto& firstStreamInfo = streams_[firstActiveStreamIndex];
-  int64_t desiredPts = *maybeDesiredPts_ * firstStreamInfo.timeBase.den;
+  int64_t desiredPts =
+      secondsToClosestPts(*maybeDesiredPts_, firstStreamInfo.timeBase);
 
   // For some encodings like H265, FFMPEG sometimes seeks past the point we
   // set as the max_ts. So we use our own index to give it the exact pts of
   // the key frame that we want to seek to.
   // See https://github.com/pytorch/torchcodec/issues/179 for more details.
   // See https://trac.ffmpeg.org/ticket/11137 for the underlying ffmpeg bug.
   if (!firstStreamInfo.keyFrames.empty()) {
-    int desiredKeyFrameIndex =
-        getKeyFrameIndexForPts(firstStreamInfo, desiredPts);
+    int desiredKeyFrameIndex = getKeyFrameIndexForPtsUsingScannedIndex(
+        firstStreamInfo.keyFrames, desiredPts);
+    desiredKeyFrameIndex = std::max(desiredKeyFrameIndex, 0);
     desiredPts = firstStreamInfo.keyFrames[desiredKeyFrameIndex].pts;
   }
 
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
@@ -84,6 +84,20 @@ def test_seek_and_next(self, device):
         )
         frame_compare_function(frame_time6, reference_frame_time6.to(device))
 
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_seek_to_negative_pts(self, device):  # seek to a pts below 0, then decode
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        scan_all_streams_to_update_metadata(decoder)
+        add_video_stream(decoder, device=device)
+        frame_compare_function = get_frame_compare_function(device)
+        frame0, _, _ = get_next_frame(decoder)  # baseline: first frame before any seek
+        reference_frame0 = NASA_VIDEO.get_frame_data_by_index(0)
+        frame_compare_function(frame0, reference_frame0.to(device))
+        # A slightly negative seek target must still yield the frame at index 0.
+        seek_to_pts(decoder, -1e-4)
+        frame0, _, _ = get_next_frame(decoder)
+        frame_compare_function(frame0, reference_frame0.to(device))
+
     @pytest.mark.parametrize("device", cpu_and_cuda())
     def test_get_frame_at_pts(self, device):
         decoder = create_from_file(str(NASA_VIDEO.path))