Skip to content

Commit f479f90

Browse files
committed
Move frame tensor allocation APIs to Frame.h
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
1 parent 467adec commit f479f90

File tree

6 files changed

+104
-92
lines changed

6 files changed

+104
-92
lines changed

src/torchcodec/_core/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ function(make_torchcodec_libraries
6060
set(decoder_sources
6161
AVIOContextHolder.cpp
6262
FFMPEGCommon.cpp
63+
Frame.cpp
6364
DeviceInterface.cpp
6465
CpuDeviceInterface.cpp
6566
SingleStreamDecoder.cpp

src/torchcodec/_core/CudaDeviceInterface.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
#include "src/torchcodec/_core/CudaDeviceInterface.h"
88
#include "src/torchcodec/_core/FFMPEGCommon.h"
9-
#include "src/torchcodec/_core/SingleStreamDecoder.h"
109

1110
extern "C" {
1211
#include <libavutil/hwcontext_cuda.h>

src/torchcodec/_core/Frame.cpp

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
// All rights reserved.
3+
//
4+
// This source code is licensed under the BSD-style license found in the
5+
// LICENSE file in the root directory of this source tree.
6+
7+
#include "src/torchcodec/_core/Frame.h"
8+
9+
namespace facebook::torchcodec {
10+
11+
// Allocates an uninitialized uint8 tensor in HWC layout on the given device.
// If numFrames is provided, the result is a batched NHWC tensor of shape
// {numFrames, height, width, 3}; otherwise it is a single {height, width, 3}
// frame. All dimensions are validated with TORCH_CHECK before allocation.
torch::Tensor allocateEmptyHWCTensor(
    int height,
    int width,
    torch::Device device,
    std::optional<int> numFrames) {
  TORCH_CHECK(height > 0, "height must be > 0, got: ", height);
  TORCH_CHECK(width > 0, "width must be > 0, got: ", width);
  const auto tensorOptions = torch::TensorOptions()
                                 .dtype(torch::kUInt8)
                                 .layout(torch::kStrided)
                                 .device(device);
  if (!numFrames.has_value()) {
    // Un-batched case: a single HWC frame.
    return torch::empty({height, width, 3}, tensorOptions);
  }
  const int numFramesValue = *numFrames;
  TORCH_CHECK(
      numFramesValue >= 0, "numFrames must be >= 0, got: ", numFramesValue);
  return torch::empty({numFramesValue, height, width, 3}, tensorOptions);
}
31+
32+
} // namespace facebook::torchcodec

src/torchcodec/_core/Frame.h

+71
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#pragma once
88

99
#include <torch/types.h>
10+
#include "src/torchcodec/_core/FFMPEGCommon.h"
1011
#include "src/torchcodec/_core/Metadata.h"
1112
#include "src/torchcodec/_core/StreamOptions.h"
1213

@@ -44,4 +45,74 @@ struct AudioFramesOutput {
4445
double ptsSeconds;
4546
};
4647

48+
// --------------------------------------------------------------------------
49+
// FRAME TENSOR ALLOCATION APIs
50+
// --------------------------------------------------------------------------
51+
52+
// Note [Frame Tensor allocation and height and width]
53+
//
54+
// We always allocate [N]HWC tensors. The low-level decoding functions all
55+
// assume HWC tensors, since this is what FFmpeg natively handles. It's up to
56+
// the high-level decoding entry-points to permute that back to CHW, by calling
57+
// maybePermuteHWC2CHW().
58+
//
59+
// Also, importantly, the way we figure out the height and width of the
60+
// output frame tensor varies, and depends on the decoding entry-point. In
61+
// *decreasing order of accuracy*, we use the following sources for determining
62+
// height and width:
63+
// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the
64+
// AVframe, *post*-resizing. This is only used for single-frame decoding APIs,
65+
// on CPU, with filtergraph.
66+
// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from
67+
// the user-specified options if they exist, or the height and width of the
68+
// AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
69+
// our code or within FFmpeg code, this should be exactly the same as
70+
// getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
71+
// decoding APIs, on CPU with swscale, and on GPU.
72+
// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
73+
// the user-specified options if they exist, or the height and width from the
74+
// stream metadata, which itself got its value from the CodecContext, when the
75+
// stream was added. This is used by batch decoding APIs, for both GPU and
76+
// CPU.
77+
//
78+
// The source of truth for height and width really is the (resized) AVFrame: it
79+
// comes from the decoded output of FFmpeg. The info from the metadata (i.e.
80+
// from the CodecContext) may not be as accurate. However, the AVFrame is only
81+
// available late in the call stack, when the frame is decoded, while the
82+
// CodecContext is available early when a stream is added. This is why we use
83+
// the CodecContext for pre-allocating batched output tensors (we could
84+
// pre-allocate those only once we decode the first frame to get the info from
85+
// the AVFrame, but that's a more complex logic).
86+
//
87+
// Because the sources for height and width may disagree, we may end up with
88+
// conflicts: e.g. if we pre-allocate a batch output tensor based on the
89+
// metadata info, but the decoded AVFrame has a different height and width.
90+
// It is very important to check the height and width assumptions where the
91+
// tensors memory is used/filled in order to avoid segfaults.
92+
93+
// Plain height/width pair describing the spatial size of a frame tensor.
struct FrameDims {
  int height;
  int width;

  FrameDims(int frameHeight, int frameWidth)
      : height(frameHeight), width(frameWidth) {}
};
99+
100+
// There's nothing preventing you from calling this on a non-resized frame, but
101+
// please don't.
102+
FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
103+
104+
FrameDims getHeightAndWidthFromOptionsOrMetadata(
105+
const VideoStreamOptions& videoStreamOptions,
106+
const StreamMetadata& streamMetadata);
107+
108+
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
109+
const VideoStreamOptions& videoStreamOptions,
110+
const UniqueAVFrame& avFrame);
111+
112+
torch::Tensor allocateEmptyHWCTensor(
113+
int height,
114+
int width,
115+
torch::Device device,
116+
std::optional<int> numFrames = std::nullopt);
117+
47118
} // namespace facebook::torchcodec

src/torchcodec/_core/SingleStreamDecoder.cpp

-21
Original file line numberDiff line numberDiff line change
@@ -1448,27 +1448,6 @@ FrameBatchOutput::FrameBatchOutput(
14481448
height, width, videoStreamOptions.device, numFrames);
14491449
}
14501450

1451-
torch::Tensor allocateEmptyHWCTensor(
1452-
int height,
1453-
int width,
1454-
torch::Device device,
1455-
std::optional<int> numFrames) {
1456-
auto tensorOptions = torch::TensorOptions()
1457-
.dtype(torch::kUInt8)
1458-
.layout(torch::kStrided)
1459-
.device(device);
1460-
TORCH_CHECK(height > 0, "height must be > 0, got: ", height);
1461-
TORCH_CHECK(width > 0, "width must be > 0, got: ", width);
1462-
if (numFrames.has_value()) {
1463-
auto numFramesValue = numFrames.value();
1464-
TORCH_CHECK(
1465-
numFramesValue >= 0, "numFrames must be >= 0, got: ", numFramesValue);
1466-
return torch::empty({numFramesValue, height, width, 3}, tensorOptions);
1467-
} else {
1468-
return torch::empty({height, width, 3}, tensorOptions);
1469-
}
1470-
}
1471-
14721451
// Returns a [N]CHW *view* of a [N]HWC input tensor, if the options require so.
14731452
// The [N] leading batch-dimension is optional i.e. the input tensor can be 3D
14741453
// or 4D.

src/torchcodec/_core/SingleStreamDecoder.h

-70
Original file line numberDiff line numberDiff line change
@@ -345,76 +345,6 @@ class SingleStreamDecoder {
345345
bool initialized_ = false;
346346
};
347347

348-
// --------------------------------------------------------------------------
349-
// FRAME TENSOR ALLOCATION APIs
350-
// --------------------------------------------------------------------------
351-
352-
// Note [Frame Tensor allocation and height and width]
353-
//
354-
// We always allocate [N]HWC tensors. The low-level decoding functions all
355-
// assume HWC tensors, since this is what FFmpeg natively handles. It's up to
356-
// the high-level decoding entry-points to permute that back to CHW, by calling
357-
// maybePermuteHWC2CHW().
358-
//
359-
// Also, importantly, the way we figure out the the height and width of the
360-
// output frame tensor varies, and depends on the decoding entry-point. In
361-
// *decreasing order of accuracy*, we use the following sources for determining
362-
// height and width:
363-
// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the
364-
// AVframe, *post*-resizing. This is only used for single-frame decoding APIs,
365-
// on CPU, with filtergraph.
366-
// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from
367-
// the user-specified options if they exist, or the height and width of the
368-
// AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
369-
// our code or within FFmpeg code, this should be exactly the same as
370-
// getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
371-
// decoding APIs, on CPU with swscale, and on GPU.
372-
// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
373-
// the user-specified options if they exist, or the height and width form the
374-
// stream metadata, which itself got its value from the CodecContext, when the
375-
// stream was added. This is used by batch decoding APIs, for both GPU and
376-
// CPU.
377-
//
378-
// The source of truth for height and width really is the (resized) AVFrame: it
379-
// comes from the decoded ouptut of FFmpeg. The info from the metadata (i.e.
380-
// from the CodecContext) may not be as accurate. However, the AVFrame is only
381-
// available late in the call stack, when the frame is decoded, while the
382-
// CodecContext is available early when a stream is added. This is why we use
383-
// the CodecContext for pre-allocating batched output tensors (we could
384-
// pre-allocate those only once we decode the first frame to get the info frame
385-
// the AVFrame, but that's a more complex logic).
386-
//
387-
// Because the sources for height and width may disagree, we may end up with
388-
// conflicts: e.g. if we pre-allocate a batch output tensor based on the
389-
// metadata info, but the decoded AVFrame has a different height and width.
390-
// it is very important to check the height and width assumptions where the
391-
// tensors memory is used/filled in order to avoid segfaults.
392-
393-
struct FrameDims {
394-
int height;
395-
int width;
396-
397-
FrameDims(int h, int w) : height(h), width(w) {}
398-
};
399-
400-
// There's nothing preventing you from calling this on a non-resized frame, but
401-
// please don't.
402-
FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
403-
404-
FrameDims getHeightAndWidthFromOptionsOrMetadata(
405-
const VideoStreamOptions& videoStreamOptions,
406-
const StreamMetadata& streamMetadata);
407-
408-
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
409-
const VideoStreamOptions& videoStreamOptions,
410-
const UniqueAVFrame& avFrame);
411-
412-
torch::Tensor allocateEmptyHWCTensor(
413-
int height,
414-
int width,
415-
torch::Device device,
416-
std::optional<int> numFrames = std::nullopt);
417-
418348
// Prints the SingleStreamDecoder::DecodeStats to the ostream.
419349
std::ostream& operator<<(
420350
std::ostream& os,

0 commit comments

Comments
 (0)