Better default heuristic

NicolasHug · NicolasHug · commit 27fdbac28b82 · 2025-04-09T14:40:57.000+01:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -133,25 +133,38 @@ AudioEncoder::AudioEncoder(
 }
 
 AVSampleFormat AudioEncoder::findOutputSampleFormat(const AVCodec& avCodec) {
-  // Find a sample format that the encoder supports. If FLTP is supported then
-  // we use that, since this is the expected format of the input waveform.
-  // Otherwise, we'll need to convert the waveform before passing it to the
-  // encoder. Right now, the output format we'll choose is just the first format
-  // in the `sample_fmts` list that the AVCodec defines. Eventually, we may
-  // allow the user to choose.
-  // TODO-ENCODING: a better default would probably be to choose the highest
-  // available precision
+  // Find a sample format that the encoder supports. We prefer FLTP, then FLT,
+  // since FLT[P] is the format of the input waveform. If FLTP isn't supported
+  // then we'll need to convert the AVFrame's format. Past FLT[P], our heuristic
+  // is to fall back to the supported format with the highest resolution.
   if (avCodec.sample_fmts == nullptr) {
     // Can't really validate anything in this case, best we can do is hope that
     // FLTP is supported by the encoder. If not, FFmpeg will raise.
     return AV_SAMPLE_FMT_FLTP;
   }
 
-  for (auto i = 0; avCodec.sample_fmts[i] != -1; ++i) {
-    if (avCodec.sample_fmts[i] == AV_SAMPLE_FMT_FLTP) {
-      return AV_SAMPLE_FMT_FLTP;
+  std::vector<AVSampleFormat> preferredFormatsOrder = {
+      AV_SAMPLE_FMT_FLTP,
+      AV_SAMPLE_FMT_FLT,
+      AV_SAMPLE_FMT_DBLP,
+      AV_SAMPLE_FMT_DBL,
+      AV_SAMPLE_FMT_S64P,
+      AV_SAMPLE_FMT_S64,
+      AV_SAMPLE_FMT_S32P,
+      AV_SAMPLE_FMT_S32,
+      AV_SAMPLE_FMT_S16P,
+      AV_SAMPLE_FMT_S16,
+      AV_SAMPLE_FMT_U8P,
+      AV_SAMPLE_FMT_U8};
+
+  for (AVSampleFormat preferredFormat : preferredFormatsOrder) {
+    for (auto i = 0; avCodec.sample_fmts[i] != -1; ++i) {
+      if (avCodec.sample_fmts[i] == preferredFormat) {
+        return preferredFormat;
+      }
     }
   }
+  // No preferred format matched (should never happen): use the codec's first.
   return avCodec.sample_fmts[0];
 }
 
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -1139,8 +1139,9 @@ def test_round_trip(self, output_format, tmp_path):
         )
         encode_audio(encoder)
 
+        rtol, atol = (0, 1e-4) if output_format == "wav" else (None, None)
         torch.testing.assert_close(
-            self.decode(encoded_path), source_samples, rtol=0, atol=1e-4
+            self.decode(encoded_path), source_samples, rtol=rtol, atol=atol
         )
 
     @pytest.mark.skipif(in_fbcode(), reason="TODO: enable ffmpeg CLI")
@@ -1157,8 +1158,6 @@ def test_against_cli(self, asset, bit_rate, output_format, tmp_path):
         encoded_by_ffmpeg = tmp_path / f"ffmpeg_output.{output_format}"
         encoded_by_us = tmp_path / f"our_output.{output_format}"
 
-        # Note: output format may be different from ours, e.g. FFmpeg CLI would
-        # choose s32 while our current heuristic may choose s16.
         subprocess.run(
             ["ffmpeg", "-i", str(asset.path)]
             + (["-b:a", f"{bit_rate}"] if bit_rate is not None else [])
@@ -1177,11 +1176,12 @@ def test_against_cli(self, asset, bit_rate, output_format, tmp_path):
         )
         encode_audio(encoder)
 
+        rtol, atol = (0, 1e-4) if output_format == "wav" else (None, None)
         torch.testing.assert_close(
             self.decode(encoded_by_ffmpeg),
             self.decode(encoded_by_us),
-            rtol=0,
-            atol=1e-4,
+            rtol=rtol,
+            atol=atol,
         )