diff --git a/benchmarks/decoders/benchmark_decoders.py b/benchmarks/decoders/benchmark_decoders.py index d29f8c82..b1dc46c5 100644 --- a/benchmarks/decoders/benchmark_decoders.py +++ b/benchmarks/decoders/benchmark_decoders.py @@ -14,10 +14,11 @@ plot_data, run_benchmarks, TorchAudioDecoder, - TorchcodecCompiled, - TorchCodecNonCompiledBatch, - TorchcodecNonCompiledWithOptions, - TVNewAPIDecoderWithBackend, + TorchCodecCoreCompiled, + TorchCodecCoreBatch, + TorchCodecCore, + TorchCodecPublic, + TorchVision, ) @@ -70,7 +71,7 @@ def main() -> None: "For torchcodec, you can specify options with tcoptions:. " ), type=str, - default="decord,tcoptions:,torchvision,torchaudio,torchcodec_compiled,tcoptions:num_threads=1", + default="decord,tcoptions:,torchvision,torchaudio,torchcodec_compiled,torchcodec_public,tcoptions:num_threads=1,tcbatchoptions:", ) parser.add_argument( "--bm_video_dir", @@ -98,14 +99,16 @@ def main() -> None: DecordNonBatchDecoderAccurateSeek() ) elif decoder == "torchcodec": - decoder_dict["TorchCodecNonCompiled"] = TorchcodecNonCompiledWithOptions() + decoder_dict["TorchCodecCore:"] = TorchCodecCore() elif decoder == "torchcodec_compiled": - decoder_dict["TorchcodecCompiled"] = TorchcodecCompiled() + decoder_dict["TorchCodecCoreCompiled"] = TorchCodecCoreCompiled() + elif decoder == "torchcodec_public": + decoder_dict["TorchCodecPublic"] = TorchCodecPublic() elif decoder == "torchvision": - decoder_dict["TVNewAPIDecoderWithBackendVideoReader"] = ( + decoder_dict["TorchVision[backend=video_reader]"] = ( # We don't compare TorchVision's "pyav" backend because it doesn't support # accurate seeks. - TVNewAPIDecoderWithBackend("video_reader") + TorchVision("video_reader") ) elif decoder == "torchaudio": decoder_dict["TorchAudioDecoder"] = TorchAudioDecoder() @@ -117,8 +120,8 @@ def main() -> None: continue k, v = item.split("=") kwargs_dict[k] = v - decoder_dict["TorchCodecNonCompiledBatch:" + options] = ( - TorchCodecNonCompiledBatch(**kwargs_dict) + decoder_dict["TorchCodecCoreBatch" + options] = ( + TorchCodecCoreBatch(**kwargs_dict) ) elif decoder.startswith("tcoptions:"): options = decoder[len("tcoptions:") :] @@ -128,8 +131,8 @@ def main() -> None: continue k, v = item.split("=") kwargs_dict[k] = v - decoder_dict["TorchcodecNonCompiled:" + options] = ( - TorchcodecNonCompiledWithOptions(**kwargs_dict) + decoder_dict["TorchCodecCore:" + options] = ( + TorchCodecCore(**kwargs_dict) ) video_paths = args.bm_video_paths.split(",") if args.bm_video_dir: @@ -142,8 +145,9 @@ def main() -> None: decoder_dict, video_paths, num_uniform_samples, - args.bm_video_speed_min_run_seconds, - args.bm_video_creation, + num_sequential_frames_from_start=[1, 10, 100], + min_runtime_seconds=args.bm_video_speed_min_run_seconds, + benchmark_video_creation=args.bm_video_creation, ) plot_data(df_data, args.plot_path) diff --git a/benchmarks/decoders/benchmark_decoders_library.py b/benchmarks/decoders/benchmark_decoders_library.py index 3347e103..11f4fe2e 100644 --- a/benchmarks/decoders/benchmark_decoders_library.py +++ b/benchmarks/decoders/benchmark_decoders_library.py @@ -12,7 +12,7 @@ import torch import torch.utils.benchmark as benchmark -from torchcodec.decoders import VideoDecoder +from torchcodec.decoders import VideoDecoder, VideoStreamMetadata from torchcodec.decoders._core import ( _add_video_stream, @@ -78,7 +78,7 @@ def get_consecutive_frames_from_video(self, video_file, numFramesToDecode): return frames -class TVNewAPIDecoderWithBackend(AbstractDecoder): +class TorchVision(AbstractDecoder): def __init__(self, backend): self._backend = backend self._print_each_iteration_time = False @@ -125,7 +125,7 @@ def get_consecutive_frames_from_video(self, video_file, numFramesToDecode): return frames -class TorchcodecNonCompiledWithOptions(AbstractDecoder): +class TorchCodecCore(AbstractDecoder): def __init__(self, num_threads=None, color_conversion_library=None, device="cpu"): self._print_each_iteration_time = False self._num_threads = int(num_threads) if num_threads else None @@ -186,7 +186,7 @@ def get_consecutive_frames_from_video(self, video_file, numFramesToDecode): return frames -class TorchCodecNonCompiledBatch(AbstractDecoder): +class TorchCodecCoreBatch(AbstractDecoder): def __init__(self, num_threads=None, color_conversion_library=None): self._print_each_iteration_time = False self._num_threads = int(num_threads) if num_threads else None @@ -227,6 +227,24 @@ def get_consecutive_frames_from_video(self, video_file, numFramesToDecode): ) return frames +class TorchCodecPublic(AbstractDecoder): + def __init__(self, num_ffmpeg_threads=None): + self._num_ffmpeg_threads = int(num_ffmpeg_threads) if num_ffmpeg_threads else None + + def get_frames_from_video(self, video_file, pts_list): + decoder = VideoDecoder(video_file, num_ffmpeg_threads=self._num_ffmpeg_threads) + return decoder.get_frames_played_at(pts_list) + + def get_consecutive_frames_from_video(self, video_file, numFramesToDecode): + decoder = VideoDecoder(video_file, num_ffmpeg_threads=self._num_ffmpeg_threads) + frames = [] + count = 0 + for frame in decoder: + frames.append(frame) + count += 1 + if count == numFramesToDecode: + break + return frames @torch.compile(fullgraph=True, backend="eager") def compiled_seek_and_next(decoder, pts): @@ -239,7 +257,7 @@ def compiled_next(decoder): return get_next_frame(decoder) -class TorchcodecCompiled(AbstractDecoder): +class TorchCodecCoreCompiled(AbstractDecoder): def __init__(self): pass @@ -450,62 +468,75 @@ def plot_data(df_data, plot_path): plot_path, ) +def get_metadata(video_file_path: str) -> VideoStreamMetadata: + return VideoDecoder(video_file_path).metadata def run_benchmarks( - decoder_dict, - video_files_paths, - num_uniform_samples, - min_runtime_seconds, - benchmark_video_creation, + decoder_dict: dict[str, AbstractDecoder], + video_files_paths: list[str], + num_samples: int, + num_sequential_frames_from_start: list[int], + min_runtime_seconds: float, + benchmark_video_creation: bool, ) -> list[dict[str, str | float | int]]: + # Ensure that we have the same seed across benchmark runs. + torch.manual_seed(0) + + print(f"video_files_paths={video_files_paths}") + results = [] df_data = [] - print(f"video_files_paths={video_files_paths}") verbose = False - for decoder_name, decoder in decoder_dict.items(): - for video_file_path in video_files_paths: + for video_file_path in video_files_paths: + metadata = get_metadata(video_file_path) + metadata_label = f"{metadata.codec} {metadata.width}x{metadata.height}, {metadata.duration_seconds}s {metadata.average_fps}fps" + + duration = metadata.duration_seconds + uniform_pts_list = [ + i * duration / num_samples for i in range(num_samples) + ] + + # Note that we are using the same random pts values for all decoders for the same + # video. However, because we use the duration as part of this calculation, we + # are using different random pts values across videos. + random_pts_list = (torch.rand(num_samples) * duration).tolist() + + for decoder_name, decoder in decoder_dict.items(): print(f"video={video_file_path}, decoder={decoder_name}") - # We only use the VideoDecoder to get the metadata and get - # the list of PTS values to seek to. - simple_decoder = VideoDecoder(video_file_path) - duration = simple_decoder.metadata.duration_seconds - pts_list = [ - i * duration / num_uniform_samples for i in range(num_uniform_samples) - ] - metadata = simple_decoder.metadata - metadata_string = f"{metadata.codec} {metadata.width}x{metadata.height}, {metadata.duration_seconds}s {metadata.average_fps}fps" - if verbose: - print( - f"video={video_file_path}, decoder={decoder_name}, pts_list={pts_list}" + + for kind, pts_list in [("uniform", uniform_pts_list), ("random", random_pts_list)]: + if verbose: + print( + f"video={video_file_path}, decoder={decoder_name}, pts_list={pts_list}" + ) + seeked_result = benchmark.Timer( + stmt="decoder.get_frames_from_video(video_file, pts_list)", + globals={ + "video_file": video_file_path, + "pts_list": pts_list, + "decoder": decoder, + }, + label=f"video={video_file_path} {metadata_label}", + sub_label=decoder_name, + description=f"{kind} {num_samples} seek()+next()", ) - seeked_result = benchmark.Timer( - stmt="decoder.get_frames_from_video(video_file, pts_list)", - globals={ - "video_file": video_file_path, - "pts_list": pts_list, - "decoder": decoder, - }, - label=f"video={video_file_path} {metadata_string}", - sub_label=decoder_name, - description=f"{num_uniform_samples} seek()+next()", - ) - results.append( - seeked_result.blocked_autorange(min_run_time=min_runtime_seconds) - ) - df_item = {} - df_item["decoder"] = decoder_name - df_item["video"] = video_file_path - df_item["description"] = results[-1].description - df_item["frame_count"] = num_uniform_samples - df_item["median"] = results[-1].median - df_item["iqr"] = results[-1].iqr - df_item["type"] = "seek()+next()" - df_item["fps"] = 1.0 * num_uniform_samples / results[-1].median - df_item["fps_p75"] = 1.0 * num_uniform_samples / results[-1]._p75 - df_item["fps_p25"] = 1.0 * num_uniform_samples / results[-1]._p25 - df_data.append(df_item) - - for num_consecutive_nexts in [1, 10]: + results.append( + seeked_result.blocked_autorange(min_run_time=min_runtime_seconds) + ) + df_item = {} + df_item["decoder"] = decoder_name + df_item["video"] = video_file_path + df_item["description"] = results[-1].description + df_item["frame_count"] = num_samples + df_item["median"] = results[-1].median + df_item["iqr"] = results[-1].iqr + df_item["type"] = f"{kind}:seek()+next()" + df_item["fps"] = 1.0 * num_samples / results[-1].median + df_item["fps_p75"] = 1.0 * num_samples / results[-1]._p75 + df_item["fps_p25"] = 1.0 * num_samples / results[-1]._p25 + df_data.append(df_item) + + for num_consecutive_nexts in num_sequential_frames_from_start: consecutive_frames_result = benchmark.Timer( stmt="decoder.get_consecutive_frames_from_video(video_file, consecutive_frames_to_extract)", globals={ @@ -513,7 +544,7 @@ def run_benchmarks( "consecutive_frames_to_extract": num_consecutive_nexts, "decoder": decoder, }, - label=f"video={video_file_path} {metadata_string}", + label=f"video={video_file_path} {metadata_label}", sub_label=decoder_name, description=f"{num_consecutive_nexts} next()", ) @@ -537,17 +568,16 @@ def run_benchmarks( first_video_file_path = video_files_paths[0] if benchmark_video_creation: - simple_decoder = VideoDecoder(first_video_file_path) - metadata = simple_decoder.metadata - metadata_string = f"{metadata.codec} {metadata.width}x{metadata.height}, {metadata.duration_seconds}s {metadata.average_fps}fps" + metadata = get_metadata(video_file_path) + metadata_label = f"{metadata.codec} {metadata.width}x{metadata.height}, {metadata.duration_seconds}s {metadata.average_fps}fps" creation_result = benchmark.Timer( stmt="create_torchcodec_decoder_from_file(video_file)", globals={ "video_file": first_video_file_path, "create_torchcodec_decoder_from_file": create_torchcodec_decoder_from_file, }, - label=f"video={first_video_file_path} {metadata_string}", - sub_label="TorchcodecNonCompiled", + label=f"video={first_video_file_path} {metadata_label}", + sub_label="TorchCodecCore:", description="create()+next()", ) results.append( diff --git a/benchmarks/decoders/benchmark_readme_chart.png b/benchmarks/decoders/benchmark_readme_chart.png index 6fb6c594..a1fd6b47 100644 Binary files a/benchmarks/decoders/benchmark_readme_chart.png and b/benchmarks/decoders/benchmark_readme_chart.png differ diff --git a/benchmarks/decoders/benchmark_readme_data.json b/benchmarks/decoders/benchmark_readme_data.json index c3759ea9..f5878a10 100644 --- a/benchmarks/decoders/benchmark_readme_data.json +++ b/benchmarks/decoders/benchmark_readme_data.json @@ -1,145 +1,145 @@ [ { "decoder": "TorchCodec", - "description": "10 seek()+next()", - "fps": 296.7497929852154, - "fps_p25": 304.82592121401444, - "fps_p75": 286.39866868882336, + "description": "uniform 10 seek()+next()", + "fps": 315.05924655387844, + "fps_p25": 323.6567367722293, + "fps_p75": 303.4217234978234, "frame_count": 10, - "iqr": 0.0021107543725520372, - "median": 0.03369842283427715, - "type": "seek()+next()", + "iqr": 0.0020604978781193495, + "median": 0.031740061938762665, + "type": "uniform:seek()+next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { "decoder": "TorchCodec", - "description": "1 next()", - "fps": 141.80053650468176, - "fps_p25": 144.12265279456386, - "fps_p75": 128.28507218641042, - "frame_count": 1, - "iqr": 0.0008566047297790648, - "median": 0.007052159495651722, - "type": "next()", + "description": "random 10 seek()+next()", + "fps": 312.8775910187949, + "fps_p25": 316.65411586172576, + "fps_p75": 304.67664202566044, + "frame_count": 10, + "iqr": 0.0012414834462106256, + "median": 0.03196138134226203, + "type": "random:seek()+next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { "decoder": "TorchCodec", - "description": "10 next()", - "fps": 638.7876251007046, - "fps_p25": 647.626546889273, - "fps_p75": 629.6538548699413, - "frame_count": 10, - "iqr": 0.00044074421748518944, - "median": 0.015654655173420906, + "description": "100 next()", + "fps": 948.3281062356014, + "fps_p25": 1211.2939721426944, + "fps_p75": 884.5162407819654, + "frame_count": 100, + "iqr": 0.030499806627631187, + "median": 0.10544873587787151, "type": "next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { "decoder": "TorchCodec[num_threads=1]", - "description": "10 seek()+next()", - "fps": 126.6773946375974, - "fps_p25": 128.33021600580943, - "fps_p75": 124.2963412180317, + "description": "uniform 10 seek()+next()", + "fps": 131.3991915256709, + "fps_p25": 132.80657677770856, + "fps_p75": 130.0875105195256, "frame_count": 10, - "iqr": 0.00252892030403018, - "median": 0.07894068257883191, - "type": "seek()+next()", + "iqr": 0.0015738545916974545, + "median": 0.07610396901145577, + "type": "uniform:seek()+next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { "decoder": "TorchCodec[num_threads=1]", - "description": "1 next()", - "fps": 410.70043288059617, - "fps_p25": 416.39979027639464, - "fps_p75": 405.8298526819803, - "frame_count": 1, - "iqr": 6.25486485660077e-05, - "median": 0.0024348647333681584, - "type": "next()", + "description": "random 10 seek()+next()", + "fps": 131.76952597486445, + "fps_p25": 133.43827127306128, + "fps_p75": 129.72933699334547, + "frame_count": 10, + "iqr": 0.0021425478626042604, + "median": 0.0758900810033083, + "type": "random:seek()+next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { "decoder": "TorchCodec[num_threads=1]", - "description": "10 next()", - "fps": 758.7565583035099, - "fps_p25": 766.9872077345758, - "fps_p75": 751.3583938952637, - "frame_count": 10, - "iqr": 0.00027120066806674004, - "median": 0.013179457746446133, + "description": "100 next()", + "fps": 767.4262194030912, + "fps_p25": 954.5964481242703, + "fps_p75": 751.3156802763621, + "frame_count": 100, + "iqr": 0.028343535726889968, + "median": 0.13030568603426218, "type": "next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { - "decoder": "TorchVision[backend=VideoReader]", - "description": "10 seek()+next()", - "fps": 7.880664295730362, - "fps_p25": 7.924876540397429, - "fps_p75": 7.827668314297991, + "decoder": "TorchVision[backend=video_reader]", + "description": "uniform 10 seek()+next()", + "fps": 7.87802293808524, + "fps_p25": 7.9221952370727795, + "fps_p75": 7.816352619742458, "frame_count": 10, - "iqr": 0.01567032840102911, - "median": 1.2689285604283214, - "type": "seek()+next()", + "iqr": 0.017092708498239517, + "median": 1.2693540090695024, + "type": "uniform:seek()+next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { - "decoder": "TorchVision[backend=VideoReader]", - "description": "1 next()", - "fps": 209.3834723742803, - "fps_p25": 211.77546088016425, - "fps_p75": 199.25812670019334, - "frame_count": 1, - "iqr": 0.0002966334810480479, - "median": 0.00477592614479363, - "type": "next()", + "decoder": "TorchVision[backend=video_reader]", + "description": "random 10 seek()+next()", + "fps": 7.257307771543773, + "fps_p25": 7.305370582206469, + "fps_p75": 7.209328679013935, + "frame_count": 10, + "iqr": 0.018235752126201987, + "median": 1.3779214434325695, + "type": "random:seek()+next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { - "decoder": "TorchVision[backend=VideoReader]", - "description": "10 next()", - "fps": 531.7221665034224, - "fps_p25": 538.7259880033903, - "fps_p75": 522.8300241450709, - "frame_count": 10, - "iqr": 0.0005643628537654877, - "median": 0.018806814216077328, + "decoder": "TorchVision[backend=video_reader]", + "description": "100 next()", + "fps": 843.6237746060216, + "fps_p25": 852.9895625098609, + "fps_p75": 830.8058002517034, + "frame_count": 100, + "iqr": 0.0031303432770073414, + "median": 0.11853625159710646, "type": "next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { "decoder": "TorchAudio", - "description": "10 seek()+next()", - "fps": 27.403418366590216, - "fps_p25": 27.57563496650987, - "fps_p75": 27.276339015442346, + "description": "uniform 10 seek()+next()", + "fps": 28.283386183212908, + "fps_p25": 28.48861769505288, + "fps_p75": 27.962052080094207, "frame_count": 10, - "iqr": 0.003979140194132924, - "median": 0.36491797724738717, - "type": "seek()+next()", + "iqr": 0.006610161624848843, + "median": 0.3535644542425871, + "type": "uniform:seek()+next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { "decoder": "TorchAudio", - "description": "1 next()", - "fps": 243.36030386646544, - "fps_p25": 246.1182470856226, - "fps_p75": 240.91573311529928, - "frame_count": 1, - "iqr": 8.774134330451558e-05, - "median": 0.004109133593738079, - "type": "next()", + "description": "random 10 seek()+next()", + "fps": 26.009247898010745, + "fps_p25": 26.13538377314808, + "fps_p75": 25.757236346453418, + "frame_count": 10, + "iqr": 0.00561736966483295, + "median": 0.38447863003239036, + "type": "random:seek()+next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, { "decoder": "TorchAudio", - "description": "10 next()", - "fps": 564.5307153840273, - "fps_p25": 572.2932075367765, - "fps_p75": 558.1691247911658, - "frame_count": 10, - "iqr": 0.00044215633533895016, - "median": 0.01771382801234722, + "description": "100 next()", + "fps": 659.7600283811723, + "fps_p25": 668.0071480761926, + "fps_p75": 652.052492189031, + "frame_count": 100, + "iqr": 0.003662889124825597, + "median": 0.15157026145607233, "type": "next()", "video": "/tmp/torchcodec_benchmarking_videos/640x480_10s_30fps_600gop_libx264_yuv420p.mp4" }, @@ -150,4 +150,4 @@ "python_version": "3.11.10", "system": "Linux" } -] +] \ No newline at end of file diff --git a/benchmarks/decoders/generate_readme_data.py b/benchmarks/decoders/generate_readme_data.py index 6fb859c0..bf8152a6 100644 --- a/benchmarks/decoders/generate_readme_data.py +++ b/benchmarks/decoders/generate_readme_data.py @@ -15,8 +15,8 @@ generate_videos, run_benchmarks, TorchAudioDecoder, - TorchcodecNonCompiledWithOptions, - TVNewAPIDecoderWithBackend, + TorchCodecPublic, + TorchVision, ) @@ -46,23 +46,24 @@ def main() -> None: video_files_paths = glob.glob(f"{videos_dir_path}/*.mp4") decoder_dict = {} - decoder_dict["TorchCodec"] = TorchcodecNonCompiledWithOptions() - decoder_dict["TorchCodec[num_threads=1]"] = TorchcodecNonCompiledWithOptions( - num_threads=1 + decoder_dict["TorchCodec"] = TorchCodecPublic() + decoder_dict["TorchCodec[num_threads=1]"] = TorchCodecPublic( + num_ffmpeg_threads=1 ) - decoder_dict["TorchVision[backend=VideoReader]"] = TVNewAPIDecoderWithBackend( + decoder_dict["TorchVision[backend=video_reader]"] = TorchVision( "video_reader" ) decoder_dict["TorchAudio"] = TorchAudioDecoder() # These are the number of uniform seeks we do in the seek+decode benchmark. - num_uniform_samples = 10 + num_samples = 10 df_data = run_benchmarks( decoder_dict, video_files_paths, - num_uniform_samples, - 10, - False, + num_samples, + num_sequential_frames_from_start=[100], + min_runtime_seconds=30, + benchmark_video_creation=False, ) df_data.append( {