@@ -26,6 +26,20 @@ int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
26
26
std::round (seconds * timeBase.den / timeBase.num ));
27
27
}
28
28
29
+ // Some videos aren't properly encoded and do not specify pts values for
30
+ // packets, and thus for frames. Unset values correspond to INT64_MIN. When that
31
+ // happens, we fall back to the dts value, which hopefully exists and is correct.
32
+ // Accessing AVFrames' and AVPackets' pts values should **always** go through
33
+ // the helpers below. Then, the "pts" fields in our structs like FrameInfo.pts
34
+ // should be interpreted as "pts if it exists, dts otherwise".
35
+ int64_t getPtsOrDts (ReferenceAVPacket& packet) {
36
+ return packet->pts == INT64_MIN ? packet->dts : packet->pts ;
37
+ }
38
+
39
+ int64_t getPtsOrDts (const UniqueAVFrame& avFrame) {
40
+ return avFrame->pts == INT64_MIN ? avFrame->pkt_dts : avFrame->pts ;
41
+ }
42
+
29
43
} // namespace
30
44
31
45
// --------------------------------------------------------------------------
@@ -223,16 +237,16 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
223
237
int streamIndex = packet->stream_index ;
224
238
auto & streamMetadata = containerMetadata_.allStreamMetadata [streamIndex];
225
239
streamMetadata.minPtsFromScan = std::min (
226
- streamMetadata.minPtsFromScan .value_or (INT64_MAX), packet-> pts );
240
+ streamMetadata.minPtsFromScan .value_or (INT64_MAX), getPtsOrDts ( packet) );
227
241
streamMetadata.maxPtsFromScan = std::max (
228
242
streamMetadata.maxPtsFromScan .value_or (INT64_MIN),
229
- packet-> pts + packet->duration );
243
+ getPtsOrDts ( packet) + packet->duration );
230
244
streamMetadata.numFramesFromScan =
231
245
streamMetadata.numFramesFromScan .value_or (0 ) + 1 ;
232
246
233
247
// Note that we set the other value in this struct, nextPts, only after
234
248
// we have scanned all packets and sorted by pts.
235
- FrameInfo frameInfo = {packet-> pts };
249
+ FrameInfo frameInfo = {getPtsOrDts ( packet) };
236
250
if (packet->flags & AV_PKT_FLAG_KEY) {
237
251
frameInfo.isKeyFrame = true ;
238
252
streamInfos_[streamIndex].keyFrames .push_back (frameInfo);
@@ -493,8 +507,9 @@ FrameOutput SingleStreamDecoder::getNextFrame() {
493
507
// Validates the active stream, asks decodeAVFrame() for a frame using a
// predicate on the frame's timestamp, and converts that frame into a
// FrameOutput. `preAllocatedOutputTensor` is forwarded untouched to
// convertAVFrameToFrameOutput().
FrameOutput SingleStreamDecoder::getNextFrameInternal(
    std::optional<torch::Tensor> preAllocatedOutputTensor) {
  validateActiveStream();

  // Predicate handed to decodeAVFrame(): true once the candidate frame's
  // timestamp (pts, or dts when pts is unset) is at or past cursor_.
  auto isAtOrPastCursor = [this](const UniqueAVFrame& candidate) {
    return getPtsOrDts(candidate) >= cursor_;
  };

  UniqueAVFrame avFrame = decodeAVFrame(isAtOrPastCursor);
  return convertAVFrameToFrameOutput(avFrame, preAllocatedOutputTensor);
}
500
515
@@ -630,9 +645,10 @@ FrameOutput SingleStreamDecoder::getFramePlayedAt(double seconds) {
630
645
UniqueAVFrame avFrame =
631
646
decodeAVFrame ([seconds, this ](const UniqueAVFrame& avFrame) {
632
647
StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
633
- double frameStartTime = ptsToSeconds (avFrame->pts , streamInfo.timeBase );
648
+ double frameStartTime =
649
+ ptsToSeconds (getPtsOrDts (avFrame), streamInfo.timeBase );
634
650
double frameEndTime = ptsToSeconds (
635
- avFrame-> pts + getDuration (avFrame), streamInfo.timeBase );
651
+ getPtsOrDts ( avFrame) + getDuration (avFrame), streamInfo.timeBase );
636
652
if (frameStartTime > seconds) {
637
653
// FFMPEG seeked past the frame we are looking for even though we
638
654
// set max_ts to be our needed timestamp in avformat_seek_file()
@@ -859,8 +875,8 @@ AudioFramesOutput SingleStreamDecoder::getFramesPlayedInRangeAudio(
859
875
try {
860
876
UniqueAVFrame avFrame =
861
877
decodeAVFrame ([startPts, stopPts](const UniqueAVFrame& avFrame) {
862
- return startPts < avFrame-> pts + getDuration (avFrame) &&
863
- stopPts > avFrame-> pts ;
878
+ return startPts < getPtsOrDts ( avFrame) + getDuration (avFrame) &&
879
+ stopPts > getPtsOrDts ( avFrame) ;
864
880
});
865
881
auto frameOutput = convertAVFrameToFrameOutput (avFrame);
866
882
if (!firstFramePtsSeconds.has_value ()) {
@@ -1130,7 +1146,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
1130
1146
// haven't received as frames. Eventually we will either hit AVERROR_EOF from
1131
1147
// av_receive_frame() or the user will have seeked to a different location in
1132
1148
// the file and that will flush the decoder.
1133
- streamInfo.lastDecodedAvFramePts = avFrame-> pts ;
1149
+ streamInfo.lastDecodedAvFramePts = getPtsOrDts ( avFrame) ;
1134
1150
streamInfo.lastDecodedAvFrameDuration = getDuration (avFrame);
1135
1151
1136
1152
return avFrame;
@@ -1147,7 +1163,8 @@ FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput(
1147
1163
FrameOutput frameOutput;
1148
1164
auto & streamInfo = streamInfos_[activeStreamIndex_];
1149
1165
frameOutput.ptsSeconds = ptsToSeconds (
1150
- avFrame->pts , formatContext_->streams [activeStreamIndex_]->time_base );
1166
+ getPtsOrDts (avFrame),
1167
+ formatContext_->streams [activeStreamIndex_]->time_base );
1151
1168
frameOutput.durationSeconds = ptsToSeconds (
1152
1169
getDuration (avFrame),
1153
1170
formatContext_->streams [activeStreamIndex_]->time_base );
0 commit comments