|
7 | 7 | #pragma once
|
8 | 8 |
|
9 | 9 | #include <torch/types.h>
|
| 10 | +#include "src/torchcodec/_core/FFMPEGCommon.h" |
10 | 11 | #include "src/torchcodec/_core/Metadata.h"
|
11 | 12 | #include "src/torchcodec/_core/StreamOptions.h"
|
12 | 13 |
|
@@ -44,4 +45,74 @@ struct AudioFramesOutput {
|
44 | 45 | double ptsSeconds;
|
45 | 46 | };
|
46 | 47 |
|
| 48 | +// -------------------------------------------------------------------------- |
| 49 | +// FRAME TENSOR ALLOCATION APIs |
| 50 | +// -------------------------------------------------------------------------- |
| 51 | + |
| 52 | +// Note [Frame Tensor allocation and height and width] |
| 53 | +// |
| 54 | +// We always allocate [N]HWC tensors. The low-level decoding functions all |
| 55 | +// assume HWC tensors, since this is what FFmpeg natively handles. It's up to |
| 56 | +// the high-level decoding entry-points to permute that back to CHW, by calling |
| 57 | +// maybePermuteHWC2CHW(). |
| 58 | +// |
| 59 | +// Also, importantly, the way we figure out the height and width of the |
| 60 | +// output frame tensor varies, and depends on the decoding entry-point. In |
| 61 | +// *decreasing order of accuracy*, we use the following sources for determining |
| 62 | +// height and width: |
| 63 | +// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the |
| 64 | +// AVFrame, *post*-resizing. This is only used for single-frame decoding APIs, |
| 65 | +// on CPU, with filtergraph. |
| 66 | +// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from |
| 67 | +// the user-specified options if they exist, or the height and width of the |
| 68 | +// AVFrame *before* it is resized. In theory, i.e. if there are no bugs within |
| 69 | +// our code or within FFmpeg code, this should be exactly the same as |
| 70 | +// getHeightAndWidthFromResizedAVFrame(). This is used by single-frame |
| 71 | +// decoding APIs, on CPU with swscale, and on GPU. |
| 72 | +// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from |
| 73 | +// the user-specified options if they exist, or the height and width from the |
| 74 | +// stream metadata, which itself got its value from the CodecContext, when the |
| 75 | +// stream was added. This is used by batch decoding APIs, for both GPU and |
| 76 | +// CPU. |
| 77 | +// |
| 78 | +// The source of truth for height and width really is the (resized) AVFrame: it |
| 79 | +// comes from the decoded output of FFmpeg. The info from the metadata (i.e. |
| 80 | +// from the CodecContext) may not be as accurate. However, the AVFrame is only |
| 81 | +// available late in the call stack, when the frame is decoded, while the |
| 82 | +// CodecContext is available early when a stream is added. This is why we use |
| 83 | +// the CodecContext for pre-allocating batched output tensors (we could |
| 84 | +// pre-allocate those only once we decode the first frame to get the info from |
| 85 | +// the AVFrame, but that's a more complex logic). |
| 86 | +// |
| 87 | +// Because the sources for height and width may disagree, we may end up with |
| 88 | +// conflicts: e.g. if we pre-allocate a batch output tensor based on the |
| 89 | +// metadata info, but the decoded AVFrame has a different height and width. |
| 90 | +// It is very important to check the height and width assumptions where the |
| 91 | +// tensors' memory is used/filled in order to avoid segfaults. |
| 92 | + |
| | +// Simple (height, width) pair used to carry the dimensions of a frame. |
| | +// Because a two-argument constructor is declared, there is no implicit |
| | +// default constructor: both values must be supplied. Note the argument |
| | +// order: height first, then width. |
| 93 | +struct FrameDims { |
| 94 | + int height; |
| 95 | + int width; |
| 96 | + |
| 97 | + FrameDims(int h, int w) : height(h), width(w) {} |
| 98 | +}; |
| 99 | + |
| | +// Returns the height and width of an AVFrame *after* it has been resized. |
| | +// Intended for single-frame decoding APIs, on CPU, with filtergraph. |
| | +// NOTE(review): the definition is not visible in this header; presumably it |
| | +// just reads the frame's own height/width fields -- confirm. |
| 100 | +// There's nothing preventing you from calling this on a non-resized frame, but |
| 101 | +// please don't. |
| 102 | +FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame); |
| 103 | + |
| | +// Returns the height and width from the user-specified options if they exist, |
| | +// otherwise from the stream metadata (whose values come from the CodecContext |
| | +// at the time the stream was added). Intended for batch decoding APIs, on both |
| | +// CPU and GPU, where output tensors are pre-allocated before any frame is |
| | +// decoded. The metadata may disagree with the decoded AVFrame, so callers |
| | +// should validate dimensions before filling the tensor. |
| 104 | +FrameDims getHeightAndWidthFromOptionsOrMetadata( |
| 105 | + const VideoStreamOptions& videoStreamOptions, |
| 106 | + const StreamMetadata& streamMetadata); |
| 107 | + |
| | +// Returns the height and width from the user-specified options if they exist, |
| | +// otherwise from the AVFrame *before* it is resized. Intended for single-frame |
| | +// decoding APIs, on CPU with swscale, and on GPU. |
| 108 | +FrameDims getHeightAndWidthFromOptionsOrAVFrame( |
| 109 | + const VideoStreamOptions& videoStreamOptions, |
| 110 | + const UniqueAVFrame& avFrame); |
| 111 | + |
| | +// Allocates an empty frame tensor in HWC layout on `device`. |
| | +// NOTE(review): presumably the result is [numFrames, height, width, C] when |
| | +// numFrames is provided and [height, width, C] otherwise; the channel count |
| | +// and dtype are not visible in this header -- confirm against the definition. |
| 112 | +torch::Tensor allocateEmptyHWCTensor( |
| 113 | + int height, |
| 114 | + int width, |
| 115 | + torch::Device device, |
| 116 | + std::optional<int> numFrames = std::nullopt); |
| 117 | + |
47 | 118 | } // namespace facebook::torchcodec
|
0 commit comments