Convert sample format, update tests

NicolasHug · NicolasHug · commit a0dcafd829cc · 2025-04-09T12:56:14.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -96,9 +96,6 @@ AudioEncoder::AudioEncoder(
   // may need to convert the wf into a supported output sample format, which is
   // what the `.sample_fmt` defines.
   avCodecContext_->sample_fmt = findOutputSampleFormat(*avCodec);
-  printf(
-      "Will be using: %s\n",
-      av_get_sample_fmt_name(avCodecContext_->sample_fmt));
 
   // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
   // planar (fltp).
@@ -143,6 +140,8 @@ AVSampleFormat AudioEncoder::findOutputSampleFormat(const AVCodec& avCodec) {
   // encoder. Right now, the output format we'll choose is just the first format
   // in the `sample_fmts` list that the AVCodec defines. Eventually, we may
   // allow the user to choose.
+  // TODO-ENCODING: a better default would probably be to choose the format
+  // with the highest available precision.
   if (avCodec.sample_fmts == nullptr) {
     // Can't really validate anything in this case, best we can do is hope that
     // FLTP is supported by the encoder. If not, FFmpeg will raise.
@@ -164,7 +163,7 @@ void AudioEncoder::encode() {
   int numSamplesAllocatedPerFrame =
       avCodecContext_->frame_size > 0 ? avCodecContext_->frame_size : 256;
   avFrame->nb_samples = numSamplesAllocatedPerFrame;
-  avFrame->format = avCodecContext_->sample_fmt;
+  avFrame->format = AV_SAMPLE_FMT_FLTP;
   avFrame->sample_rate = avCodecContext_->sample_rate;
   avFrame->pts = 0;
   setChannelLayout(avFrame, avCodecContext_);
@@ -230,7 +229,29 @@ void AudioEncoder::encode() {
 
 void AudioEncoder::encodeInnerLoop(
     AutoAVPacket& autoAVPacket,
-    const UniqueAVFrame& avFrame) {
+    const UniqueAVFrame& srcAVFrame) {
+  bool mustConvert =
+      (avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP &&
+       srcAVFrame != nullptr);
+  UniqueAVFrame convertedAVFrame;
+  if (mustConvert) {
+    if (!swrContext_) {
+      swrContext_.reset(createSwrContext(
+          avCodecContext_,
+          AV_SAMPLE_FMT_FLTP,
+          avCodecContext_->sample_fmt,
+          srcAVFrame->sample_rate, // No sample rate conversion
+          srcAVFrame->sample_rate));
+    }
+    convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
+        swrContext_,
+        srcAVFrame,
+        avCodecContext_->sample_fmt,
+        srcAVFrame->sample_rate, // No sample rate conversion
+        srcAVFrame->sample_rate);
+  }
+  const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
+
   auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
   TORCH_CHECK(
       status == AVSUCCESS,
@@ -267,6 +288,9 @@ void AudioEncoder::encodeInnerLoop(
 }
 
 void AudioEncoder::flushBuffers() {
+  // We flush the encoder's buffers (by sending it a null frame), but not the
+  // swresample buffers. Flushing swresample is only necessary when converting
+  // sample rates, which we don't do for encoding.
   AutoAVPacket autoAVPacket;
   encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
 }
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -26,13 +26,14 @@ class AudioEncoder {
  private:
   void encodeInnerLoop(
       AutoAVPacket& autoAVPacket,
-      const UniqueAVFrame& avFrame);
+      const UniqueAVFrame& srcAVFrame);
   void flushBuffers();
   AVSampleFormat findOutputSampleFormat(const AVCodec& avCodec);
 
   UniqueEncodingAVFormatContext avFormatContext_;
   UniqueAVCodecContext avCodecContext_;
   int streamIndex_;
+  UniqueSwrContext swrContext_;
 
   const torch::Tensor wf_;
 };
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -1122,34 +1122,35 @@ def test_bad_input(self, tmp_path):
                 bit_rate=-1,  # bad
             )
 
-    def test_round_trip(self, tmp_path):
-        # Check that decode(encode(samples)) == samples
+    @pytest.mark.parametrize("output_format", ("wav", "flac"))
+    def test_round_trip(self, output_format, tmp_path):
+        # Check that decode(encode(samples)) == samples on lossless formats
         asset = NASA_AUDIO_MP3
         source_samples = self.decode(asset)
 
-        encoded_path = tmp_path / "output.mp3"
+        encoded_path = tmp_path / f"output.{output_format}"
         encoder = create_audio_encoder(
             wf=source_samples, sample_rate=asset.sample_rate, filename=str(encoded_path)
         )
         encode_audio(encoder)
 
-        # TODO-ENCODING: tol should be stricter. We probably need to encode
-        # into a lossless format.
         torch.testing.assert_close(
-            self.decode(encoded_path), source_samples, rtol=0, atol=0.07
+            self.decode(encoded_path), source_samples, rtol=0, atol=1e-4
         )
 
-    # TODO-ENCODING: test more encoding formats
     @pytest.mark.skipif(in_fbcode(), reason="TODO: enable ffmpeg CLI")
     @pytest.mark.parametrize("asset", (NASA_AUDIO_MP3, SINE_MONO_S32))
     @pytest.mark.parametrize("bit_rate", (None, 0, 44_100, 999_999_999))
-    def test_against_cli(self, asset, bit_rate, tmp_path):
+    @pytest.mark.parametrize("output_format", ("mp3", "wav", "flac"))
+    def test_against_cli(self, asset, bit_rate, output_format, tmp_path):
         # Encodes samples with our encoder and with the FFmpeg CLI, and checks
         # that both decoded outputs are equal
 
-        encoded_by_ffmpeg = tmp_path / "ffmpeg_output.mp3"
-        encoded_by_us = tmp_path / "our_output.mp3"
+        encoded_by_ffmpeg = tmp_path / f"ffmpeg_output.{output_format}"
+        encoded_by_us = tmp_path / f"our_output.{output_format}"
 
+        # Note: the sample format chosen by the FFmpeg CLI may differ from ours,
+        # e.g. the CLI would choose s32 while our current heuristic may choose s16.
         subprocess.run(
             ["ffmpeg", "-i", str(asset.path)]
             + (["-b:a", f"{bit_rate}"] if bit_rate is not None else [])
@@ -1169,7 +1170,10 @@ def test_against_cli(self, asset, bit_rate, tmp_path):
         encode_audio(encoder)
 
         torch.testing.assert_close(
-            self.decode(encoded_by_ffmpeg), self.decode(encoded_by_us)
+            self.decode(encoded_by_ffmpeg),
+            self.decode(encoded_by_us),
+            rtol=0,
+            atol=1e-4,
         )