-
Notifications
You must be signed in to change notification settings - Fork 35
AudioDecoder: specify desired num_channels #678
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
c8c1631
ccd0bfb
d29c3f8
7291b83
3aa6c65
65c8da1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,7 +81,6 @@ void setDefaultChannelLayout( | |
AVChannelLayout channel_layout; | ||
av_channel_layout_default(&channel_layout, numChannels); | ||
avCodecContext->ch_layout = channel_layout; | ||
|
||
#else | ||
uint64_t channel_layout = av_get_default_channel_layout(numChannels); | ||
avCodecContext->channel_layout = channel_layout; | ||
|
@@ -106,32 +105,80 @@ void setChannelLayout( | |
#endif | ||
} | ||
|
||
namespace { | ||
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4 | ||
|
||
// Returns: | ||
// - the srcAVFrame's channel layout if srcAVFrame has desiredNumChannels | ||
// - the default channel layout with desiredNumChannels otherwise. | ||
AVChannelLayout getDesiredChannelLayout( | ||
int desiredNumChannels, | ||
const UniqueAVFrame& srcAVFrame) { | ||
AVChannelLayout desiredLayout; | ||
if (desiredNumChannels == getNumChannels(srcAVFrame)) { | ||
desiredLayout = srcAVFrame->ch_layout; | ||
} else { | ||
av_channel_layout_default(&desiredLayout, desiredNumChannels); | ||
} | ||
return desiredLayout; | ||
} | ||
|
||
#else | ||
|
||
// Same as above | ||
int64_t getDesiredChannelLayout( | ||
int desiredNumChannels, | ||
const UniqueAVFrame& srcAVFrame) { | ||
int64_t desiredLayout; | ||
if (desiredNumChannels == getNumChannels(srcAVFrame)) { | ||
desiredLayout = srcAVFrame->channel_layout; | ||
} else { | ||
desiredLayout = av_get_default_channel_layout(desiredNumChannels); | ||
} | ||
return desiredLayout; | ||
} | ||
#endif | ||
} // namespace | ||
|
||
// Sets dstAVFrame' channel layout to getDesiredChannelLayout(): see doc above | ||
void setChannelLayout( | ||
UniqueAVFrame& dstAVFrame, | ||
const UniqueAVFrame& srcAVFrame) { | ||
const UniqueAVFrame& srcAVFrame, | ||
int desiredNumChannels) { | ||
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4 | ||
dstAVFrame->ch_layout = srcAVFrame->ch_layout; | ||
AVChannelLayout desiredLayout = | ||
getDesiredChannelLayout(desiredNumChannels, srcAVFrame); | ||
auto status = av_channel_layout_copy(&dstAVFrame->ch_layout, &desiredLayout); | ||
TORCH_CHECK( | ||
status == AVSUCCESS, | ||
"Couldn't copy channel layout to avFrame: ", | ||
getFFMPEGErrorStringFromErrorCode(status)); | ||
#else | ||
dstAVFrame->channel_layout = srcAVFrame->channel_layout; | ||
dstAVFrame->channel_layout = | ||
getDesiredChannelLayout(desiredNumChannels, srcAVFrame); | ||
dstAVFrame->channels = desiredNumChannels; | ||
#endif | ||
} | ||
|
||
SwrContext* createSwrContext( | ||
UniqueAVCodecContext& avCodecContext, | ||
AVSampleFormat sourceSampleFormat, | ||
AVSampleFormat desiredSampleFormat, | ||
int sourceSampleRate, | ||
int desiredSampleRate) { | ||
int desiredSampleRate, | ||
const UniqueAVFrame& srcAVFrame, | ||
int desiredNumChannels) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. slightly changed the signature to take an AVFrame instead of the AVCodecContext. This is where we get the source num_channel and source channel layout. It's more accurate to get it from the AVFrame, although they should both always be the same. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: I prefer for the primary FFmpeg data structures to appear first in a function. I think of these functions as if they are methods on those structures, and that gets reinforced by making them the first parameter. I think I understand the current order, which There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that for functions that look like methods, we want the FFmpeg struct to be the first. The reason I removed the avCodecContext from the top and now use an AVFrame is precisely because this function shouldn't read as a method.
Yes, you're correct. Ideally, we shouldn't need to pass the AVFrame at all, we should just pass its Does that make sense? Basically: we're passing an AVFrame because of some FFmpeg version constraint, but really this just be "source channel layout", and thus this isn't a method on an AVFrame at all. |
||
SwrContext* swrContext = nullptr; | ||
int status = AVSUCCESS; | ||
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4 | ||
AVChannelLayout layout = avCodecContext->ch_layout; | ||
AVChannelLayout sourceLayout = srcAVFrame->ch_layout; | ||
AVChannelLayout desiredLayout = | ||
getDesiredChannelLayout(desiredNumChannels, srcAVFrame); | ||
status = swr_alloc_set_opts2( | ||
&swrContext, | ||
&layout, | ||
&desiredLayout, | ||
desiredSampleFormat, | ||
desiredSampleRate, | ||
&layout, | ||
&sourceLayout, | ||
sourceSampleFormat, | ||
sourceSampleRate, | ||
0, | ||
|
@@ -142,13 +189,14 @@ SwrContext* createSwrContext( | |
"Couldn't create SwrContext: ", | ||
getFFMPEGErrorStringFromErrorCode(status)); | ||
#else | ||
int64_t layout = static_cast<int64_t>(avCodecContext->channel_layout); | ||
int64_t desiredLayout = | ||
getDesiredChannelLayout(desiredNumChannels, srcAVFrame); | ||
swrContext = swr_alloc_set_opts( | ||
nullptr, | ||
layout, | ||
desiredLayout, | ||
desiredSampleFormat, | ||
desiredSampleRate, | ||
layout, | ||
srcAVFrame->channel_layout, | ||
sourceSampleFormat, | ||
sourceSampleRate, | ||
0, | ||
|
@@ -167,20 +215,21 @@ SwrContext* createSwrContext( | |
return swrContext; | ||
} | ||
|
||
UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate( | ||
UniqueAVFrame convertAudioAVFrameSamples( | ||
const UniqueSwrContext& swrContext, | ||
const UniqueAVFrame& srcAVFrame, | ||
AVSampleFormat desiredSampleFormat, | ||
int sourceSampleRate, | ||
int desiredSampleRate) { | ||
int desiredSampleRate, | ||
int desiredNumChannels) { | ||
UniqueAVFrame convertedAVFrame(av_frame_alloc()); | ||
TORCH_CHECK( | ||
convertedAVFrame, | ||
"Could not allocate frame for sample format conversion."); | ||
|
||
setChannelLayout(convertedAVFrame, srcAVFrame); | ||
convertedAVFrame->format = static_cast<int>(desiredSampleFormat); | ||
|
||
convertedAVFrame->sample_rate = desiredSampleRate; | ||
int sourceSampleRate = srcAVFrame->sample_rate; | ||
if (sourceSampleRate != desiredSampleRate) { | ||
// Note that this is an upper bound on the number of output samples. | ||
// `swr_convert()` will likely not fill convertedAVFrame with that many | ||
|
@@ -200,6 +249,8 @@ UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate( | |
convertedAVFrame->nb_samples = srcAVFrame->nb_samples; | ||
} | ||
|
||
setChannelLayout(convertedAVFrame, srcAVFrame, desiredNumChannels); | ||
|
||
auto status = av_frame_get_buffer(convertedAVFrame.get(), 0); | ||
TORCH_CHECK( | ||
status == AVSUCCESS, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,7 +40,7 @@ TORCH_LIBRARY(torchcodec_ns, m) { | |
m.def( | ||
"add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None) -> ()"); | ||
m.def( | ||
"add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None) -> ()"); | ||
"add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()"); | ||
m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()"); | ||
m.def("get_next_frame(Tensor(a!) decoder) -> (Tensor, Tensor, Tensor)"); | ||
m.def( | ||
|
@@ -280,9 +280,11 @@ void add_video_stream( | |
void add_audio_stream( | ||
at::Tensor& decoder, | ||
std::optional<int64_t> stream_index = std::nullopt, | ||
std::optional<int64_t> sample_rate = std::nullopt) { | ||
std::optional<int64_t> sample_rate = std::nullopt, | ||
std::optional<int64_t> num_channels = std::nullopt) { | ||
AudioStreamOptions audioStreamOptions; | ||
audioStreamOptions.sampleRate = sample_rate; | ||
audioStreamOptions.numChannels = num_channels; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This implicitly converts an optional int64 to an optional int. The same implicit conversion already happens for sample_rate, and for the VideoStreamOptions' fields like height, width, and ffmpegThreadCount. I'll open an issue to fix / check these all at once. |
||
|
||
auto videoDecoder = unwrapTensorToGetDecoder(decoder); | ||
videoDecoder->addAudioStream(stream_index.value_or(-1), audioStreamOptions); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I renamed the function, because it converts too many things to list all of them now. @scotts I also remember that you wanted to align all the
sourceStuff
tosrcStuff
- which I'll do as a quick follow-up!