pytorch
diff --git a/‎src/torchcodec/_core/CMakeLists.txt
+1-1 b/‎src/torchcodec/_core/CMakeLists.txt
+1-1
diff --git a/‎src/torchcodec/_core/CudaDevice.cpp renamed to ‎src/torchcodec/_core/CudaDeviceInterface.cpp
+14-11 b/‎src/torchcodec/_core/CudaDevice.cpp renamed to ‎src/torchcodec/_core/CudaDeviceInterface.cpp
+14-11
diff --git a/‎src/torchcodec/_core/CudaDevice.h renamed to ‎src/torchcodec/_core/CudaDeviceInterface.h
+5-5 b/‎src/torchcodec/_core/CudaDevice.h renamed to ‎src/torchcodec/_core/CudaDeviceInterface.h
+5-5
diff --git a/‎src/torchcodec/_core/DeviceInterface.h
+4-3 b/‎src/torchcodec/_core/DeviceInterface.h
+4-3
diff --git a/‎src/torchcodec/_core/Encoder.cpp
+8-6 b/‎src/torchcodec/_core/Encoder.cpp
+8-6
diff --git a/‎src/torchcodec/_core/Encoder.h
+7-1 b/‎src/torchcodec/_core/Encoder.h
+7-1
diff --git a/‎src/torchcodec/_core/Frame.h
+47 b/‎src/torchcodec/_core/Frame.h
+47
diff --git a/‎src/torchcodec/_core/Metadata.h
+70 b/‎src/torchcodec/_core/Metadata.h
+70
@@ -68,7 +68,7 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
-        list(APPEND decoder_sources CudaDevice.cpp)
+        list(APPEND decoder_sources CudaDeviceInterface.cpp)
     endif()
 
     set(decoder_library_dependencies
 
@@ -4,7 +4,7 @@
 #include <torch/types.h>
 #include <mutex>
 
-#include "src/torchcodec/_core/CudaDevice.h"
+#include "src/torchcodec/_core/CudaDeviceInterface.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/SingleStreamDecoder.h"
 
@@ -16,9 +16,10 @@ extern "C" {
 namespace facebook::torchcodec {
 namespace {
 
-bool g_cuda = registerDeviceInterface(
-    torch::kCUDA,
-    [](const torch::Device& device) { return new CudaDevice(device); });
+// Registers a factory for torch::kCUDA devices when this translation unit's
+// globals are initialized. The factory returns a raw, heap-allocated
+// CudaDeviceInterface.
+// NOTE(review): ownership of that pointer is presumably taken by whoever
+// invokes the registered factory -- confirm in registerDeviceInterface().
+bool g_cuda =
+    registerDeviceInterface(torch::kCUDA, [](const torch::Device& device) {
+      return new CudaDeviceInterface(device);
+    });
 
 // We reuse cuda contexts across VideoDecoder instances. This is because
 // creating a cuda context is expensive. The cache mechanism is as follows:
@@ -163,20 +164,21 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
 }
 } // namespace
 
-CudaDevice::CudaDevice(const torch::Device& device) : DeviceInterface(device) {
+// Forwards `device` to the DeviceInterface base and rejects anything that is
+// not a CUDA device by throwing std::runtime_error.
+CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
+    : DeviceInterface(device) {
   if (device_.type() != torch::kCUDA) {
     throw std::runtime_error("Unsupported device: " + device_.str());
   }
 }
 
-CudaDevice::~CudaDevice() {
+// If a hardware device context was set, offers it to the context cache and
+// then releases this object's reference to it.
+// NOTE(review): this assumes addToCacheIfCacheHasCapacity() keeps its own
+// reference when it accepts the context -- its definition is not shown here.
+CudaDeviceInterface::~CudaDeviceInterface() {
   if (ctx_) {
     addToCacheIfCacheHasCapacity(device_, ctx_);
     av_buffer_unref(&ctx_);
   }
 }
 
-void CudaDevice::initializeContext(AVCodecContext* codecContext) {
+void CudaDeviceInterface::initializeContext(AVCodecContext* codecContext) {
   TORCH_CHECK(!ctx_, "FFmpeg HW device context already initialized");
 
   // It is important for pytorch itself to create the cuda context. If ffmpeg
@@ -189,10 +191,10 @@ void CudaDevice::initializeContext(AVCodecContext* codecContext) {
   return;
 }
 
-void CudaDevice::convertAVFrameToFrameOutput(
-    const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+void CudaDeviceInterface::convertAVFrameToFrameOutput(
+    const VideoStreamOptions& videoStreamOptions,
     UniqueAVFrame& avFrame,
-    SingleStreamDecoder::FrameOutput& frameOutput,
+    FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   TORCH_CHECK(
       avFrame->format == AV_PIX_FMT_CUDA,
@@ -263,7 +265,8 @@ void CudaDevice::convertAVFrameToFrameOutput(
 // we have to do this because of an FFmpeg bug where hardware decoding is not
 // appropriately set, so we just go off and find the matching codec for the CUDA
 // device
-std::optional<const AVCodec*> CudaDevice::findCodec(const AVCodecID& codecId) {
+std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
+    const AVCodecID& codecId) {
   void* i = nullptr;
   const AVCodec* codec = nullptr;
   while ((codec = av_codec_iterate(&i)) != nullptr) {
 
@@ -10,20 +10,20 @@
 
 namespace facebook::torchcodec {
 
-class CudaDevice : public DeviceInterface {
+class CudaDeviceInterface : public DeviceInterface {
  public:
-  CudaDevice(const torch::Device& device);
+  CudaDeviceInterface(const torch::Device& device);
 
-  virtual ~CudaDevice();
+  virtual ~CudaDeviceInterface();
 
   std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) override;
 
   void initializeContext(AVCodecContext* codecContext) override;
 
   void convertAVFrameToFrameOutput(
-      const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+      const VideoStreamOptions& videoStreamOptions,
       UniqueAVFrame& avFrame,
-      SingleStreamDecoder::FrameOutput& frameOutput,
+      FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor =
           std::nullopt) override;
 
 
@@ -12,7 +12,8 @@
 #include <stdexcept>
 #include <string>
 #include "FFMPEGCommon.h"
-#include "src/torchcodec/_core/SingleStreamDecoder.h"
+#include "src/torchcodec/_core/Frame.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace facebook::torchcodec {
 
@@ -41,9 +42,9 @@ class DeviceInterface {
   virtual void initializeContext(AVCodecContext* codecContext) = 0;
 
   virtual void convertAVFrameToFrameOutput(
-      const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+      const VideoStreamOptions& videoStreamOptions,
       UniqueAVFrame& avFrame,
-      SingleStreamDecoder::FrameOutput& frameOutput,
+      FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt) = 0;
 
  protected:
 
@@ -8,7 +8,8 @@ AudioEncoder::~AudioEncoder() {}
 AudioEncoder::AudioEncoder(
     const torch::Tensor wf,
     int sampleRate,
-    std::string_view fileName)
+    std::string_view fileName,
+    std::optional<int64_t> bit_rate)
     : wf_(wf), sampleRate_(sampleRate) {
   TORCH_CHECK(
       wf_.dtype() == torch::kFloat32,
@@ -49,11 +50,12 @@ AudioEncoder::AudioEncoder(
   TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
   avCodecContext_.reset(avCodecContext);
 
-  // TODO-ENCODING I think this sets the bit rate to the minimum supported.
-  // That's not what the ffmpeg CLI would choose by default, so we should try to
-  // do the same.
-  // TODO-ENCODING Should also let user choose for compressed formats like mp3.
-  avCodecContext_->bit_rate = 0;
+  if (bit_rate.has_value()) {
+    TORCH_CHECK(*bit_rate >= 0, "bit_rate=", *bit_rate, " must be >= 0.");
+  }
+  // bit_rate=None defaults to 0, which is what the FFmpeg CLI seems to use as
+  // well when "-b:a" isn't specified.
+  avCodecContext_->bit_rate = bit_rate.value_or(0);
 
   avCodecContext_->sample_rate = sampleRate_;
 
 
@@ -7,10 +7,16 @@ class AudioEncoder {
  public:
   ~AudioEncoder();
 
+  // TODO-ENCODING: document in public docs that bit_rate value is only
+  // best-effort, matching to the closest supported bit_rate. I.e. passing 1 is
+  // like passing 0, which results in choosing the minimum supported bit rate.
+  // Passing 44_100 could result in output being 44000 if only 44000 is
+  // supported.
   AudioEncoder(
       const torch::Tensor wf,
       int sampleRate,
-      std::string_view fileName);
+      std::string_view fileName,
+      std::optional<int64_t> bit_rate = std::nullopt);
   void encode();
 
  private:
 
@@ -0,0 +1,47 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torch/types.h>
+#include "src/torchcodec/_core/Metadata.h"
+#include "src/torchcodec/_core/StreamOptions.h"
+
+namespace facebook::torchcodec {
+
+// All public video decoding entry points return either a FrameOutput or a
+// FrameBatchOutput.
+// They are the equivalent of the user-facing Frame and FrameBatch classes in
+// Python. They contain RGB decoded frames along with some associated data
+// like PTS and duration.
+// FrameOutput is also relevant for audio decoding, typically as the output of
+// getNextFrame(), or as a temporary output variable.
+struct FrameOutput {
+  // data shape is:
+  // - 3D (C, H, W) or (H, W, C) for videos
+  // - 2D (numChannels, numSamples) for audio
+  torch::Tensor data;
+  // Both timestamps default to 0.0 so that a default-constructed FrameOutput
+  // never carries indeterminate values before the decoder fills them in.
+  double ptsSeconds = 0.0;
+  double durationSeconds = 0.0;
+};
+
+// Batched counterpart of FrameOutput: one tensor holds all frames, and the
+// per-frame PTS and duration values are stored as 1D tensors.
+struct FrameBatchOutput {
+  torch::Tensor data; // 4D: of shape NCHW or NHWC.
+  torch::Tensor ptsSeconds; // 1D of shape (N,)
+  torch::Tensor durationSeconds; // 1D of shape (N,)
+
+  // NOTE(review): presumably allocates the three tensors for `numFrames`
+  // frames using the output options and stream metadata -- the definition is
+  // not part of this header, confirm there.
+  explicit FrameBatchOutput(
+      int64_t numFrames,
+      const VideoStreamOptions& videoStreamOptions,
+      const StreamMetadata& streamMetadata);
+};
+
+// Decoded audio samples together with a single presentation timestamp.
+struct AudioFramesOutput {
+  torch::Tensor data; // shape is (numChannels, numSamples)
+  // Defaults to 0.0 so a default-constructed value is never indeterminate.
+  double ptsSeconds = 0.0;
+};
+
+} // namespace facebook::torchcodec
@@ -0,0 +1,70 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <optional>
+#include <string>
+#include <vector>
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavutil/avutil.h>
+}
+
+namespace facebook::torchcodec {
+
+// Metadata describing a single stream. Fields wrapped in std::optional are
+// empty until a value has been stored in them.
+struct StreamMetadata {
+  // Common (video and audio) fields derived from the AVStream.
+  // Defaults to -1 (not a valid stream index) until it is assigned, so a
+  // default-constructed StreamMetadata never holds an indeterminate value.
+  int streamIndex = -1;
+  // See this link for the available values:
+  // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
+  AVMediaType mediaType = AVMEDIA_TYPE_UNKNOWN;
+  std::optional<AVCodecID> codecId;
+  std::optional<std::string> codecName;
+  std::optional<double> durationSeconds;
+  std::optional<double> beginStreamFromHeader;
+  std::optional<int64_t> numFrames;
+  std::optional<int64_t> numKeyFrames;
+  std::optional<double> averageFps;
+  std::optional<double> bitRate;
+
+  // More accurate duration, obtained by scanning the file.
+  // These presentation timestamps are in time base units.
+  std::optional<int64_t> minPtsFromScan;
+  std::optional<int64_t> maxPtsFromScan;
+  // These presentation timestamps are in seconds.
+  std::optional<double> minPtsSecondsFromScan;
+  std::optional<double> maxPtsSecondsFromScan;
+  // This can be useful for index-based seeking.
+  std::optional<int64_t> numFramesFromScan;
+
+  // Video-only fields derived from the AVCodecContext.
+  std::optional<int64_t> width;
+  std::optional<int64_t> height;
+
+  // Audio-only fields
+  std::optional<int64_t> sampleRate;
+  std::optional<int64_t> numChannels;
+  std::optional<std::string> sampleFormat;
+};
+
+// Container-level metadata, together with the metadata of each of its
+// streams.
+struct ContainerMetadata {
+  std::vector<StreamMetadata> allStreamMetadata;
+  int numAudioStreams = 0;
+  int numVideoStreams = 0;
+  // Note that this is the container-level duration, which is usually the max
+  // of all stream durations available in the container.
+  std::optional<double> durationSeconds;
+  // Total bit rate at the container level, in bit/s.
+  std::optional<double> bitRate;
+  // If set, this is the index to the default audio stream.
+  std::optional<int> bestAudioStreamIndex;
+  // If set, this is the index to the default video stream.
+  std::optional<int> bestVideoStreamIndex;
+};
+
+} // namespace facebook::torchcodec
Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,7 @@ function(make_torchcodec_libraries`
`68`	`68`	`)`
`69`	`69`
`70`	`70`	`if(ENABLE_CUDA)`
`71`		`- list(APPEND decoder_sources CudaDevice.cpp)`
	`71`	`+ list(APPEND decoder_sources CudaDeviceInterface.cpp)`
`72`	`72`	`endif()`
`73`	`73`
`74`	`74`	`set(decoder_library_dependencies`