Merge branch 'main' of github.com:pytorch/torchcodec into mac_wheels_ci

scotts · scotts · commit 882ef9fa6f3b · 2024-10-28T06:40:59.000-07:00
diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
@@ -0,0 +1,144 @@
+name: Build and test Linux CUDA wheels
+
+# Runs on every pull request, on pushes to the nightly/main/release branches
+# and to release-candidate tags (e.g. v0.1.0-rc1), and on manual dispatch.
+on:
+  pull_request:
+  push:
+    branches:
+      - nightly
+      - main
+      - release/*
+    tags:
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+  workflow_dispatch:
+
+# The group key combines the workflow name, the PR number (or the ref name),
+# the commit SHA for branch refs, and whether the run was manually dispatched.
+# A new run cancels any in-progress run that has the same key.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: write
+
+# Every `run:` step uses a login shell that aborts on the first failing
+# command, including failures inside a pipeline.
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  # Calls the shared pytorch/test-infra workflow that generates the wheel build
+  # matrix for Linux. Only the CUDA variants are enabled; the CPU, XPU and ROCm
+  # variants are disabled.
+  generate-matrix:
+    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    with:
+      package-type: wheel
+      os: linux
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      with-cpu: disable
+      with-xpu: disable
+      with-rocm: disable
+      with-cuda: enable
+      build-python-only: "disable"
+  # Builds the torchcodec wheels by handing the matrix produced by
+  # `generate-matrix` to the shared pytorch/test-infra Linux wheel workflow.
+  build:
+    needs: generate-matrix
+    strategy:
+      fail-fast: false
+    name: Build and Upload wheel
+    uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
+    with:
+      repository: pytorch/torchcodec
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
+      post-script: packaging/post_build_script.sh
+      smoke-test-script: packaging/fake_smoke_test.py
+      package-name: torchcodec
+      trigger-event: ${{ github.event_name }}
+      build-platform: "python-build-package"
+      # BUILD_AGAINST_ALL_FFMPEG_FROM_S3 makes CMake build one library per
+      # FFmpeg major version; ENABLE_CUDA=1 is also read by
+      # packaging/post_build_script.sh (presumably it reaches CMake as well --
+      # TODO confirm).
+      build-command: "BUILD_AGAINST_ALL_FFMPEG_FROM_S3=1 ENABLE_CUDA=1 python -m build --wheel -vvv --no-isolation"
+
+  # Installs the wheel produced by `build` into a fresh conda environment and
+  # runs the smoke test, the Python tests and the GPU benchmark against it, once
+  # per combination of Python, CUDA and FFmpeg versions in the matrix.
+  install-and-test:
+    runs-on: linux.4xlarge.nvidia.gpu
+    strategy:
+      fail-fast: false
+      matrix:
+        # 3.9 corresponds to the minimum python version for which we build
+        # the wheel unless the label cliflow/binaries/all is present in the
+        # PR.
+        # For the actual release we should add that label and change this to
+        # include more python versions.
+        python-version: ['3.9']
+        cuda-version: ['11.8', '12.1', '12.4']
+        ffmpeg-version-for-tests: ['5', '6', '7']
+    container:
+      image: "pytorch/manylinux-builder:cuda${{ matrix.cuda-version }}"
+      options: "--gpus all -e NVIDIA_DRIVER_CAPABILITIES=video,compute,utility"
+    # NOTE(review): always() lets this job start even when `build` failed or
+    # was cancelled, in which case the artifact download below presumably
+    # fails -- confirm this is intended.
+    if: ${{ always() }}
+    needs: build
+    steps:
+      - name: Setup env vars
+        run: |
+          # "12.4" -> "124"; used in the artifact name and the PyTorch index URL.
+          cuda_version_without_periods=$(echo "${{ matrix.cuda-version }}" | sed 's/\.//g')
+          echo cuda_version_without_periods=${cuda_version_without_periods} >> $GITHUB_ENV
+      - uses: actions/download-artifact@v3
+        with:
+          # The artifact name encodes the Python and CUDA versions, so take
+          # both from the matrix to stay correct when more Python versions are
+          # added.
+          name: pytorch_torchcodec__${{ matrix.python-version }}_cu${{ env.cuda_version_without_periods }}_x86_64
+          path: pytorch/torchcodec/dist/
+      - name: Setup miniconda using test-infra
+        uses: ahmadsharif1/test-infra/.github/actions/setup-miniconda@14bc3c29f88d13b0237ab4ddf00aa409e45ade40
+        with:
+          python-version: ${{ matrix.python-version }}
+          default-packages: "conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }}"
+      - name: Check env
+        run: |
+          ${CONDA_RUN} env
+          ${CONDA_RUN} conda info
+          ${CONDA_RUN} nvidia-smi
+      - name: Update pip
+        run: ${CONDA_RUN} python -m pip install --upgrade pip
+      - name: Install PyTorch
+        run: |
+          ${CONDA_RUN} python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu${{ env.cuda_version_without_periods }}
+          ${CONDA_RUN} python -c 'import torch; print(f"{torch.__version__}"); print(f"{torch.__file__}"); print(f"{torch.cuda.is_available()=}")'
+      - name: Install torchcodec from the wheel
+        run: |
+          wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
+          echo Installing $wheel_path
+          ${CONDA_RUN} python -m pip install $wheel_path -vvv
+
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Install cuda runtime dependencies
+        run: |
+          # For some reason nvidia::libnpp=12.4 doesn't install but nvidia/label/cuda-12.4.0::libnpp does.
+          # So we use the latter convention for libnpp.
+          ${CONDA_RUN} conda install --yes nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }}
+      - name: Install test dependencies
+        run: |
+          # NOTE(review): torchvision comes from the CPU nightly index while
+          # torch was installed from the CUDA index above -- confirm that pip
+          # does not swap the CUDA torch build for a CPU one here.
+          ${CONDA_RUN} python -m pip install --pre torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
+          # Ideally we would find a way to get those dependencies from pyproject.toml
+          ${CONDA_RUN} python -m pip install numpy pytest pillow
+
+      - name: Delete the src/ folder just for fun
+        run: |
+          # The only reason we checked-out the repo is to get access to the
+          # tests. We don't care about the rest. Out of precaution, we delete
+          # the src/ folder to be extra sure that we're running the code from
+          # the installed wheel rather than from the source.
+          # This is just to be extra cautious and very overkill because a)
+          # there's no way the `torchcodec` package from src/ can be found from
+          # the PythonPath: the main point of `src/` is precisely to protect
+          # against that and b) if we ever were to execute code from
+          # `src/torchcodec`, it would fail loudly because the built .so files
+          # aren't present there.
+          rm -r src/
+          ls
+      - name: Smoke test
+        run: |
+          ${CONDA_RUN} python test/decoders/manual_smoke_test.py
+      - name: Run Python tests
+        run: |
+          # We skip test_get_ffmpeg_version because it may not have a micro version.
+          ${CONDA_RUN} FAIL_WITHOUT_CUDA=1 pytest test -k "not test_get_ffmpeg_version" -vvv
+      - name: Run Python benchmark
+        run: |
+          ${CONDA_RUN} time python benchmarks/decoders/gpu_benchmark.py --devices=cuda:0,cpu --resize_devices=none
diff --git a/.github/workflows/macos_wheel.yaml b/.github/workflows/macos_wheel.yaml
@@ -26,7 +26,7 @@ defaults:
 jobs:
 
   generate-matrix:
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@macbuildwheel
+    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
     with:
       package-type: wheel
       os: macos-arm64
@@ -42,12 +42,12 @@ jobs:
     strategy:
       fail-fast: false
     name: Build and Upload Mac wheel
-    uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@macbuildwheel
+    uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@main
     with:
       repository: pytorch/torchcodec
       ref: ""
       test-infra-repository: pytorch/test-infra
-      test-infra-ref: macbuildwheel
+      test-infra-ref: main
       build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
       post-script: packaging/post_build_script.sh
       smoke-test-script: packaging/fake_smoke_test.py
@@ -58,7 +58,7 @@ jobs:
       build-command: "BUILD_AGAINST_ALL_FFMPEG_FROM_S3=1 python -m build --wheel -vvv --no-isolation"
 
   install-and-test:
-    runs-on: macos-m1-stable
+    runs-on: macos-14-xlarge
     strategy:
       fail-fast: false
       matrix:
diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-set -eux
+set -ex
 
 source packaging/helpers.sh
 
@@ -18,7 +18,12 @@ else
     exit 1
 fi
 
-for ffmpeg_major_version in 4 5 6 7; do
+# TODO: Make ffmpeg4 work with nvcc.
+if [[ "$ENABLE_CUDA" -eq 1 ]]; then
+  ffmpeg_versions=(5 6 7)
+fi
+
+for ffmpeg_major_version in ${ffmpeg_versions[@]}; do
     assert_in_wheel $wheel_path torchcodec/libtorchcodec${ffmpeg_major_version}.${ext}
 done
 assert_not_in_wheel $wheel_path libtorchcodec.${ext}
diff --git a/src/torchcodec/decoders/_core/CMakeLists.txt b/src/torchcodec/decoders/_core/CMakeLists.txt
@@ -37,7 +37,7 @@ function(make_torchcodec_library library_name ffmpeg_target)
     set(NEEDED_LIBRARIES ${ffmpeg_target} ${TORCH_LIBRARIES}
         ${Python3_LIBRARIES})
     if(ENABLE_CUDA)
-        list(APPEND NEEDED_LIBRARIES ${CUDA_CUDA_LIBRARY}
+        list(APPEND NEEDED_LIBRARIES
             ${CUDA_nppi_LIBRARY} ${CUDA_nppicc_LIBRARY} )
     endif()
     target_link_libraries(
@@ -76,10 +76,15 @@ if(DEFINED ENV{BUILD_AGAINST_ALL_FFMPEG_FROM_S3})
         ${CMAKE_CURRENT_SOURCE_DIR}/fetch_and_expose_non_gpl_ffmpeg_libs.cmake
     )
 
-    make_torchcodec_library(libtorchcodec4 ffmpeg4)
-    make_torchcodec_library(libtorchcodec5 ffmpeg5)
-    make_torchcodec_library(libtorchcodec6 ffmpeg6)
-    make_torchcodec_library(libtorchcodec7 ffmpeg7)
+
+    # One torchcodec library is built per FFmpeg major version. ffmpeg4 is
+    # only built when CUDA is disabled.
+    if(NOT ENABLE_CUDA)
+        # TODO: Enable more ffmpeg versions for cuda.
+        make_torchcodec_library(libtorchcodec4 ffmpeg4)
+    endif()
+    make_torchcodec_library(libtorchcodec7 ffmpeg7)
+    make_torchcodec_library(libtorchcodec6 ffmpeg6)
+    make_torchcodec_library(libtorchcodec5 ffmpeg5)
+
 else()
     message(
         STATUS
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -1073,6 +1073,53 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices(
   return output;
 }
 
+// Returns the frames displayed at each of `timestamps` (in seconds) for the
+// stream `streamIndex`. Every timestamp must lie in
+// [minPtsSecondsFromScan, maxPtsSecondsFromScan) of that stream, otherwise the
+// TORCH_CHECK below fails.
+VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesDisplayedByTimestamps(
+    int streamIndex,
+    const std::vector<double>& timestamps) {
+  validateUserProvidedStreamIndex(streamIndex);
+  validateScannedAllStreams("getFramesDisplayedByTimestamps");
+
+  // The frame displayed at timestamp t and the one displayed at timestamp `t +
+  // eps` are probably the same frame, with the same index. The easiest way to
+  // avoid decoding that unique frame twice is to convert the input timestamps
+  // to indices, and leverage the de-duplication logic of getFramesAtIndices.
+  // This means this function requires a scan.
+  // TODO: longer term, we should implement this without requiring a scan
+
+  const auto& streamMetadata = containerMetadata_.streams[streamIndex];
+  const auto& stream = streams_[streamIndex];
+  double minSeconds = streamMetadata.minPtsSecondsFromScan.value();
+  double maxSeconds = streamMetadata.maxPtsSecondsFromScan.value();
+
+  std::vector<int64_t> frameIndices(timestamps.size());
+  // size_t matches timestamps.size() and avoids a signed/unsigned comparison.
+  for (size_t i = 0; i < timestamps.size(); ++i) {
+    auto framePts = timestamps[i];
+    TORCH_CHECK(
+        framePts >= minSeconds && framePts < maxSeconds,
+        "frame pts is " + std::to_string(framePts) + "; must be in range [" +
+            std::to_string(minSeconds) + ", " + std::to_string(maxSeconds) +
+            ").");
+
+    // Find the first frame whose successor starts strictly after framePts,
+    // i.e. the frame that is on display at framePts.
+    auto it = std::lower_bound(
+        stream.allFrames.begin(),
+        stream.allFrames.end(),
+        framePts,
+        [&stream](const FrameInfo& info, double framePts) {
+          return ptsToSeconds(info.nextPts, stream.timeBase) <= framePts;
+        });
+    int64_t frameIndex = it - stream.allFrames.begin();
+    // If lower_bound returned end(), frameIndex equals allFrames.size(): we
+    // couldn't match the pts value to the pts value of a NEXT FRAME. That
+    // means this timestamp falls between when the last frame is displayed and
+    // when the video ends. Hence, it should map to the index of the last
+    // frame.
+    frameIndex = std::min(
+        frameIndex, static_cast<int64_t>(stream.allFrames.size()) - 1);
+    frameIndices[i] = frameIndex;
+  }
+
+  return getFramesAtIndices(streamIndex, frameIndices);
+}
+
 VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesInRange(
     int streamIndex,
     int64_t start,
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -223,6 +223,7 @@ class VideoDecoder {
   // i.e. it will be returned when this function is called with seconds=5.0 or
   // seconds=5.999, etc.
   DecodedOutput getFrameDisplayedAtTimestampNoDemux(double seconds);
+
   DecodedOutput getFrameAtIndex(
       int streamIndex,
       int64_t frameIndex,
@@ -242,6 +243,11 @@ class VideoDecoder {
   BatchDecodedOutput getFramesAtIndices(
       int streamIndex,
       const std::vector<int64_t>& frameIndices);
+
+  // Returns the frames displayed at each of the given timestamps (in seconds)
+  // for a given stream. The timestamps are converted to frame indices and
+  // forwarded to getFramesAtIndices, so this requires a scan of all streams.
+  BatchDecodedOutput getFramesDisplayedByTimestamps(
+      int streamIndex,
+      const std::vector<double>& timestamps);
+
   // Returns frames within a given range for a given stream as a single stacked
   // Tensor. The range is defined by [start, stop). The values retrieved from
   // the range are:
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -45,6 +45,8 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "get_frames_in_range(Tensor(a!) decoder, *, int stream_index, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
   m.def(
       "get_frames_by_pts_in_range(Tensor(a!) decoder, *, int stream_index, float start_seconds, float stop_seconds) -> (Tensor, Tensor, Tensor)");
+  m.def(
+      "get_frames_by_pts(Tensor(a!) decoder, *, int stream_index, float[] timestamps) -> (Tensor, Tensor, Tensor)");
   m.def("get_json_metadata(Tensor(a!) decoder) -> str");
   m.def("get_container_json_metadata(Tensor(a!) decoder) -> str");
   m.def(
@@ -240,6 +242,16 @@ OpsBatchDecodedOutput get_frames_in_range(
       stream_index, start, stop, step.value_or(1));
   return makeOpsBatchDecodedOutput(result);
 }
+// Op implementation: unwraps the decoder stored in `decoder` and returns the
+// frames displayed at each of `timestamps` for stream `stream_index`.
+OpsBatchDecodedOutput get_frames_by_pts(
+    at::Tensor& decoder,
+    int64_t stream_index,
+    at::ArrayRef<double> timestamps) {
+  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
+  // The decoder API takes a std::vector, so copy the ArrayRef contents.
+  const std::vector<double> requestedTimestamps = timestamps.vec();
+  auto batch = videoDecoder->getFramesDisplayedByTimestamps(
+      stream_index, requestedTimestamps);
+  return makeOpsBatchDecodedOutput(batch);
+}
 
 OpsBatchDecodedOutput get_frames_by_pts_in_range(
     at::Tensor& decoder,
@@ -479,6 +491,7 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
   m.impl("get_frames_at_indices", &get_frames_at_indices);
   m.impl("get_frames_in_range", &get_frames_in_range);
   m.impl("get_frames_by_pts_in_range", &get_frames_by_pts_in_range);
+  m.impl("get_frames_by_pts", &get_frames_by_pts);
   m.impl("_test_frame_pts_equality", &_test_frame_pts_equality);
   m.impl(
       "scan_all_streams_to_update_metadata",
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.h b/src/torchcodec/decoders/_core/VideoDecoderOps.h
@@ -75,6 +75,12 @@ using OpsBatchDecodedOutput = std::tuple<at::Tensor, at::Tensor, at::Tensor>;
 // given timestamp T has T >= PTS and T < PTS + Duration.
 OpsDecodedOutput get_frame_at_pts(at::Tensor& decoder, double seconds);
 
+// Return the frames displayed at the given timestamps (in seconds) for a
+// given stream.
+OpsBatchDecodedOutput get_frames_by_pts(
+    at::Tensor& decoder,
+    int64_t stream_index,
+    at::ArrayRef<double> timestamps);
+
 // Return the frame that is visible at a given index in the video.
 OpsDecodedOutput get_frame_at_index(
     at::Tensor& decoder,
@@ -85,8 +91,7 @@ OpsDecodedOutput get_frame_at_index(
 // duration as tensors.
 OpsDecodedOutput get_next_frame(at::Tensor& decoder);
 
-// Return the frames at a given index for a given stream as a single stacked
-// Tensor.
+// Return the frames at given indices for a given stream
 OpsBatchDecodedOutput get_frames_at_indices(
     at::Tensor& decoder,
     int64_t stream_index,
diff --git a/src/torchcodec/decoders/_core/__init__.py b/src/torchcodec/decoders/_core/__init__.py
@@ -22,6 +22,7 @@
     get_frame_at_index,
     get_frame_at_pts,
     get_frames_at_indices,
+    get_frames_by_pts,
     get_frames_by_pts_in_range,
     get_frames_in_range,
     get_json_metadata,
diff --git a/src/torchcodec/decoders/_core/video_decoder_ops.py b/src/torchcodec/decoders/_core/video_decoder_ops.py
@@ -71,6 +71,7 @@ def load_torchcodec_extension():
 get_frame_at_pts = torch.ops.torchcodec_ns.get_frame_at_pts.default
 get_frame_at_index = torch.ops.torchcodec_ns.get_frame_at_index.default
 get_frames_at_indices = torch.ops.torchcodec_ns.get_frames_at_indices.default
+get_frames_by_pts = torch.ops.torchcodec_ns.get_frames_by_pts.default
 get_frames_in_range = torch.ops.torchcodec_ns.get_frames_in_range.default
 get_frames_by_pts_in_range = torch.ops.torchcodec_ns.get_frames_by_pts_in_range.default
 get_json_metadata = torch.ops.torchcodec_ns.get_json_metadata.default
@@ -172,6 +173,21 @@ def get_frame_at_pts_abstract(
     )
 
 
+@register_fake("torchcodec_ns::get_frames_by_pts")
+def get_frames_by_pts_abstract(
+    decoder: torch.Tensor,
+    *,
+    stream_index: int,
+    timestamps: List[float],
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
+    return (
+        torch.empty(image_size),
+        torch.empty([], dtype=torch.float),
+        torch.empty([], dtype=torch.float),
+    )
+
+
 @register_fake("torchcodec_ns::get_frame_at_index")
 def get_frame_at_index_abstract(
     decoder: torch.Tensor, *, stream_index: int, frame_index: int
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
diff --git a/test/utils.py b/test/utils.py