.

ahmadsharif1 · ahmadsharif1 · commit 851b399b008e · 2024-10-29T11:53:19.000-07:00
diff --git a/src/torchcodec/decoders/_core/CudaDevice.cpp b/src/torchcodec/decoders/_core/CudaDevice.cpp
@@ -201,7 +201,8 @@ void convertAVFrameToDecodedOutputOnCuda(
     const VideoDecoder::VideoStreamDecoderOptions& options,
     AVCodecContext* codecContext,
     VideoDecoder::RawDecodedOutput& rawOutput,
-    VideoDecoder::DecodedOutput& output) {
+    VideoDecoder::DecodedOutput& output,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
   AVFrame* src = rawOutput.frame.get();
 
   TORCH_CHECK(
@@ -213,7 +214,21 @@ void convertAVFrameToDecodedOutputOnCuda(
   NppiSize oSizeROI = {width, height};
   Npp8u* input[2] = {src->data[0], src->data[1]};
   torch::Tensor& dst = output.frame;
-  dst = allocateDeviceTensor({height, width, 3}, options.device);
+  if (preAllocatedOutputTensor.has_value()) {
+    dst = preAllocatedOutputTensor.value();
+    auto shape = dst.sizes();
+    TORCH_CHECK(
+        (shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
+            (shape[2] == 3),
+        "Expected tensor of shape ",
+        height,
+        "x",
+        width,
+        "x3, got ",
+        shape);
+  } else {
+    dst = allocateDeviceTensor({height, width, 3}, options.device);
+  }
 
   // Use the user-requested GPU for running the NPP kernel.
   c10::cuda::CUDAGuard deviceGuard(device);
diff --git a/src/torchcodec/decoders/_core/DeviceInterface.h b/src/torchcodec/decoders/_core/DeviceInterface.h
@@ -37,7 +37,8 @@ void convertAVFrameToDecodedOutputOnCuda(
     const VideoDecoder::VideoStreamDecoderOptions& options,
     AVCodecContext* codecContext,
     VideoDecoder::RawDecodedOutput& rawOutput,
-    VideoDecoder::DecodedOutput& output);
+    VideoDecoder::DecodedOutput& output,
+    std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
 void releaseContextOnCuda(
     const torch::Device& device,
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -196,7 +196,7 @@ VideoDecoder::BatchDecodedOutput::BatchDecodedOutput(
            options.height.value_or(*metadata.height),
            options.width.value_or(*metadata.width),
            3},
-          {torch::kUInt8})),
+          at::TensorOptions(options.device).dtype(torch::kUInt8))),
       ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
       durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {}
 
@@ -859,13 +859,14 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
     convertAVFrameToDecodedOutputOnCPU(
         rawOutput, output, preAllocatedOutputTensor);
   } else if (streamInfo.options.device.type() == torch::kCUDA) {
-    // TODO: handle pre-allocated output tensor
+    // TODO: we should fold preAllocatedOutputTensor into RawDecodedOutput.
     convertAVFrameToDecodedOutputOnCuda(
         streamInfo.options.device,
         streamInfo.options,
         streamInfo.codecContext.get(),
         rawOutput,
-        output);
+        output,
+        preAllocatedOutputTensor);
   } else {
     TORCH_CHECK(
         false, "Invalid device type: " + streamInfo.options.device.str());
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
@@ -57,6 +57,12 @@ def seek(self, pts: float):
         seek_to_pts(self.decoder, pts)
 
 
+# Asserts that at most `percentage` percent of the elements differ by more than `abs_tolerance`.
+def assert_tensor_nearly_equal(frame1, frame2, percentage=0.3, abs_tolerance=20):
+    diff = (frame2.float() - frame1.float()).abs()
+    assert (diff > abs_tolerance).float().mean() <= percentage / 100.0
+
+
 class TestOps:
     def test_seek_and_next(self):
         decoder = create_from_file(str(NASA_VIDEO.path))
@@ -137,6 +143,24 @@ def test_get_frames_at_indices(self):
         assert_tensor_equal(frames0and180[0], reference_frame0)
         assert_tensor_equal(frames0and180[1], reference_frame180)
 
+    @needs_cuda
+    def test_get_frames_at_indices_with_cuda(self):
+        decoder = create_from_file(str(NASA_VIDEO.path))
+        scan_all_streams_to_update_metadata(decoder)
+        add_video_stream(decoder, device="cuda")
+        frames0and180, *_ = get_frames_at_indices(
+            decoder, stream_index=3, frame_indices=[0, 180]
+        )
+        reference_frame0 = NASA_VIDEO.get_frame_data_by_index(0)
+        reference_frame180 = NASA_VIDEO.get_frame_data_by_index(
+            INDEX_OF_FRAME_AT_6_SECONDS
+        )
+        assert frames0and180.device.type == "cuda"
+        assert_tensor_nearly_equal(frames0and180[0].to("cpu"), reference_frame0)
+        assert_tensor_nearly_equal(
+            frames0and180[1].to("cpu"), reference_frame180, 0.3, 30
+        )
+
     def test_get_frames_at_indices_unsorted_indices(self):
         decoder = create_from_file(str(NASA_VIDEO.path))
         _add_video_stream(decoder)
@@ -657,8 +681,8 @@ def test_cuda_decoder(self):
         assert frame0.device.type == "cuda"
         frame0_cpu = frame0.to("cpu")
         reference_frame0 = NASA_VIDEO.get_frame_data_by_index(0)
-        # GPU decode is not bit-accurate. In the following assertion we ensure
-        # not more than 0.3% of values have a difference greater than 20.
+        # GPU decode is not bit-accurate, so compare using the helper's default tolerance.
+        assert_tensor_nearly_equal(frame0_cpu, reference_frame0)
         diff = (reference_frame0.float() - frame0_cpu.float()).abs()
         assert (diff > 20).float().mean() <= 0.003
         assert pts == torch.tensor([0])