Skip to content

Commit f479f90

Browse files
committed
Move frame tensor allocation APIs to Frame.h
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
1 parent 467adec commit f479f90

File tree

6 files changed

+104
-92
lines changed

6 files changed

+104
-92
lines changed

src/torchcodec/_core/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ function(make_torchcodec_libraries
6060
set(decoder_sources
6161
AVIOContextHolder.cpp
6262
FFMPEGCommon.cpp
63+
Frame.cpp
6364
DeviceInterface.cpp
6465
CpuDeviceInterface.cpp
6566
SingleStreamDecoder.cpp

src/torchcodec/_core/CudaDeviceInterface.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
#include "src/torchcodec/_core/CudaDeviceInterface.h"
88
#include "src/torchcodec/_core/FFMPEGCommon.h"
9-
#include "src/torchcodec/_core/SingleStreamDecoder.h"
109

1110
extern "C" {
1211
#include <libavutil/hwcontext_cuda.h>

src/torchcodec/_core/Frame.cpp

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
// All rights reserved.
3+
//
4+
// This source code is licensed under the BSD-style license found in the
5+
// LICENSE file in the root directory of this source tree.
6+
7+
#include "src/torchcodec/_core/Frame.h"
8+
9+
namespace facebook::torchcodec {
10+
11+
// Allocates an uninitialized uint8 tensor in HWC layout on the given device.
// If numFrames is provided, the result is a batched NHWC tensor of shape
// {numFrames, height, width, 3}; otherwise it is a single {height, width, 3}
// frame. All dimensions are validated with TORCH_CHECK before allocation.
torch::Tensor allocateEmptyHWCTensor(
    int height,
    int width,
    torch::Device device,
    std::optional<int> numFrames) {
  TORCH_CHECK(height > 0, "height must be > 0, got: ", height);
  TORCH_CHECK(width > 0, "width must be > 0, got: ", width);
  const auto tensorOptions = torch::TensorOptions()
                                 .dtype(torch::kUInt8)
                                 .layout(torch::kStrided)
                                 .device(device);
  if (!numFrames.has_value()) {
    // Un-batched case: a single HWC frame.
    return torch::empty({height, width, 3}, tensorOptions);
  }
  const int numFramesValue = *numFrames;
  TORCH_CHECK(
      numFramesValue >= 0, "numFrames must be >= 0, got: ", numFramesValue);
  return torch::empty({numFramesValue, height, width, 3}, tensorOptions);
}
31+
32+
} // namespace facebook::torchcodec

src/torchcodec/_core/Frame.h

+71
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#pragma once
88

99
#include <torch/types.h>
10+
#include "src/torchcodec/_core/FFMPEGCommon.h"
1011
#include "src/torchcodec/_core/Metadata.h"
1112
#include "src/torchcodec/_core/StreamOptions.h"
1213

@@ -44,4 +45,74 @@ struct AudioFramesOutput {
4445
double ptsSeconds;
4546
};
4647

48+
// --------------------------------------------------------------------------
49+
// FRAME TENSOR ALLOCATION APIs
50+
// --------------------------------------------------------------------------
51+
52+
// Note [Frame Tensor allocation and height and width]
53+
//
54+
// We always allocate [N]HWC tensors. The low-level decoding functions all
55+
// assume HWC tensors, since this is what FFmpeg natively handles. It's up to
56+
// the high-level decoding entry-points to permute that back to CHW, by calling
57+
// maybePermuteHWC2CHW().
58+
//
59+
// Also, importantly, the way we figure out the height and width of the
60+
// output frame tensor varies, and depends on the decoding entry-point. In
61+
// *decreasing order of accuracy*, we use the following sources for determining
62+
// height and width:
63+
// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the
64+
// AVframe, *post*-resizing. This is only used for single-frame decoding APIs,
65+
// on CPU, with filtergraph.
66+
// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from
67+
// the user-specified options if they exist, or the height and width of the
68+
// AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
69+
// our code or within FFmpeg code, this should be exactly the same as
70+
// getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
71+
// decoding APIs, on CPU with swscale, and on GPU.
72+
// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
73+
// the user-specified options if they exist, or the height and width from the
74+
// stream metadata, which itself got its value from the CodecContext, when the
75+
// stream was added. This is used by batch decoding APIs, for both GPU and
76+
// CPU.
77+
//
78+
// The source of truth for height and width really is the (resized) AVFrame: it
79+
// comes from the decoded output of FFmpeg. The info from the metadata (i.e.
80+
// from the CodecContext) may not be as accurate. However, the AVFrame is only
81+
// available late in the call stack, when the frame is decoded, while the
82+
// CodecContext is available early when a stream is added. This is why we use
83+
// the CodecContext for pre-allocating batched output tensors (we could
84+
// pre-allocate those only once we decode the first frame to get the info from
85+
// the AVFrame, but that's a more complex logic).
86+
//
87+
// Because the sources for height and width may disagree, we may end up with
88+
// conflicts: e.g. if we pre-allocate a batch output tensor based on the
89+
// metadata info, but the decoded AVFrame has a different height and width.
90+
// It is very important to check the height and width assumptions where the
91+
// tensors memory is used/filled in order to avoid segfaults.
92+
93+
// Plain height/width pair describing the spatial size of a frame tensor.
struct FrameDims {
  int height;
  int width;

  FrameDims(int frameHeight, int frameWidth)
      : height(frameHeight), width(frameWidth) {}
};
99+
100+
// There's nothing preventing you from calling this on a non-resized frame, but
101+
// please don't.
102+
FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
103+
104+
FrameDims getHeightAndWidthFromOptionsOrMetadata(
105+
const VideoStreamOptions& videoStreamOptions,
106+
const StreamMetadata& streamMetadata);
107+
108+
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
109+
const VideoStreamOptions& videoStreamOptions,
110+
const UniqueAVFrame& avFrame);
111+
112+
torch::Tensor allocateEmptyHWCTensor(
113+
int height,
114+
int width,
115+
torch::Device device,
116+
std::optional<int> numFrames = std::nullopt);
117+
47118
} // namespace facebook::torchcodec

src/torchcodec/_core/SingleStreamDecoder.cpp

-21
Original file line numberDiff line numberDiff line change
@@ -1448,27 +1448,6 @@ FrameBatchOutput::FrameBatchOutput(
14481448
height, width, videoStreamOptions.device, numFrames);
14491449
}
14501450

1451-
torch::Tensor allocateEmptyHWCTensor(
1452-
int height,
1453-
int width,
1454-
torch::Device device,
1455-
std::optional<int> numFrames) {
1456-
auto tensorOptions = torch::TensorOptions()
1457-
.dtype(torch::kUInt8)
1458-
.layout(torch::kStrided)
1459-
.device(device);
1460-
TORCH_CHECK(height > 0, "height must be > 0, got: ", height);
1461-
TORCH_CHECK(width > 0, "width must be > 0, got: ", width);
1462-
if (numFrames.has_value()) {
1463-
auto numFramesValue = numFrames.value();
1464-
TORCH_CHECK(
1465-
numFramesValue >= 0, "numFrames must be >= 0, got: ", numFramesValue);
1466-
return torch::empty({numFramesValue, height, width, 3}, tensorOptions);
1467-
} else {
1468-
return torch::empty({height, width, 3}, tensorOptions);
1469-
}
1470-
}
1471-
14721451
// Returns a [N]CHW *view* of a [N]HWC input tensor, if the options require so.
14731452
// The [N] leading batch-dimension is optional i.e. the input tensor can be 3D
14741453
// or 4D.

src/torchcodec/_core/SingleStreamDecoder.h

-70
Original file line numberDiff line numberDiff line change
@@ -345,76 +345,6 @@ class SingleStreamDecoder {
345345
bool initialized_ = false;
346346
};
347347

348-
// --------------------------------------------------------------------------
349-
// FRAME TENSOR ALLOCATION APIs
350-
// --------------------------------------------------------------------------
351-
352-
// Note [Frame Tensor allocation and height and width]
353-
//
354-
// We always allocate [N]HWC tensors. The low-level decoding functions all
355-
// assume HWC tensors, since this is what FFmpeg natively handles. It's up to
356-
// the high-level decoding entry-points to permute that back to CHW, by calling
357-
// maybePermuteHWC2CHW().
358-
//
359-
// Also, importantly, the way we figure out the the height and width of the
360-
// output frame tensor varies, and depends on the decoding entry-point. In
361-
// *decreasing order of accuracy*, we use the following sources for determining
362-
// height and width:
363-
// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the
364-
// AVframe, *post*-resizing. This is only used for single-frame decoding APIs,
365-
// on CPU, with filtergraph.
366-
// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from
367-
// the user-specified options if they exist, or the height and width of the
368-
// AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
369-
// our code or within FFmpeg code, this should be exactly the same as
370-
// getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
371-
// decoding APIs, on CPU with swscale, and on GPU.
372-
// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
373-
// the user-specified options if they exist, or the height and width form the
374-
// stream metadata, which itself got its value from the CodecContext, when the
375-
// stream was added. This is used by batch decoding APIs, for both GPU and
376-
// CPU.
377-
//
378-
// The source of truth for height and width really is the (resized) AVFrame: it
379-
// comes from the decoded ouptut of FFmpeg. The info from the metadata (i.e.
380-
// from the CodecContext) may not be as accurate. However, the AVFrame is only
381-
// available late in the call stack, when the frame is decoded, while the
382-
// CodecContext is available early when a stream is added. This is why we use
383-
// the CodecContext for pre-allocating batched output tensors (we could
384-
// pre-allocate those only once we decode the first frame to get the info frame
385-
// the AVFrame, but that's a more complex logic).
386-
//
387-
// Because the sources for height and width may disagree, we may end up with
388-
// conflicts: e.g. if we pre-allocate a batch output tensor based on the
389-
// metadata info, but the decoded AVFrame has a different height and width.
390-
// it is very important to check the height and width assumptions where the
391-
// tensors memory is used/filled in order to avoid segfaults.
392-
393-
struct FrameDims {
394-
int height;
395-
int width;
396-
397-
FrameDims(int h, int w) : height(h), width(w) {}
398-
};
399-
400-
// There's nothing preventing you from calling this on a non-resized frame, but
401-
// please don't.
402-
FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
403-
404-
FrameDims getHeightAndWidthFromOptionsOrMetadata(
405-
const VideoStreamOptions& videoStreamOptions,
406-
const StreamMetadata& streamMetadata);
407-
408-
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
409-
const VideoStreamOptions& videoStreamOptions,
410-
const UniqueAVFrame& avFrame);
411-
412-
torch::Tensor allocateEmptyHWCTensor(
413-
int height,
414-
int width,
415-
torch::Device device,
416-
std::optional<int> numFrames = std::nullopt);
417-
418348
// Prints the SingleStreamDecoder::DecodeStats to the ostream.
419349
std::ostream& operator<<(
420350
std::ostream& os,

0 commit comments

Comments
 (0)