content/renderer/media/audio_track_recorder.cc - Issue 1579693006: MediaRecorder: support sampling rate adaption in AudioTrackRecorder

Unified Diff: content/renderer/media/audio_track_recorder.cc

Issue 1579693006: MediaRecorder: support sampling rate adaption in AudioTrackRecorder (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: miu@s second round of comments Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « content/renderer/media/audio_track_recorder.h ('k') | content/renderer/media/audio_track_recorder_unittest.cc » ('j') | content/renderer/media/audio_track_recorder_unittest.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: content/renderer/media/audio_track_recorder.cc

diff --git a/content/renderer/media/audio_track_recorder.cc b/content/renderer/media/audio_track_recorder.cc

index 05107b1257cbdec9cac88b965f08b24c99509faa..3de6fd60548cd737d46c47d1f234437e25f9d4d6 100644

--- a/content/renderer/media/audio_track_recorder.cc

+++ b/content/renderer/media/audio_track_recorder.cc

@@ -12,38 +12,101 @@

#include "base/stl_util.h"

#include "media/audio/audio_parameters.h"

#include "media/base/audio_bus.h"

+#include "media/base/audio_converter.h"

+#include "media/base/audio_fifo.h"

#include "media/base/bind_to_current_loop.h"

#include "third_party/opus/src/include/opus.h"

// Note that this code follows the Chrome media convention of defining a "frame"

-// as "one multi-channel sample" as opposed to another common definition

-// meaning "a chunk of samples". Here this second definition of "frame" is

-// called a "buffer"; so what might be called "frame duration" is instead

-// "buffer duration", and so on.

+// as "one multi-channel sample" as opposed to another common definition meaning

+// "a chunk of samples". Here this second definition of "frame" is called a

+// "buffer"; so what might be called "frame duration" is instead "buffer

+// duration", and so on.

namespace content {

namespace {

-enum {

- // This is the recommended value, according to documentation in

+enum : int {

+ // Recommended value for opus_encode_float(), according to documentation in

// third_party/opus/src/include/opus.h, so that the Opus encoder does not

- // degrade the audio due to memory constraints.

- OPUS_MAX_PAYLOAD_SIZE = 4000,

- // Support for max sampling rate of 48KHz, 2 channels, 60 ms duration.

- MAX_SAMPLES_PER_BUFFER = 48 * 2 * 60,

+ // degrade the audio due to memory constraints, and is independent of the

+ // duration of the encoded buffer.

+ kOpusMaxDataBytes = 4000,

+ // Opus preferred sampling rate for encoding. This is also the one WebM likes

+ // to have: https://wiki.xiph.org/MatroskaOpus.

+ kOpusPreferredSamplingRate = 48000,

+ // Media Stream Audio Tracks always send 10ms worth of Audio.

+ kMediaStreamTrackBufferDurationMs = 10,

+ // For quality reasons we try to encode 60ms, the maximum Opus buffer.

+ kOpusPreferredBufferDurationMs = 60,

};

+// Conversion between buffers following a N:1 length ratio is much easier, as is

+// the case here. This parameter represents that ratio: need N input buffers for

+// 1 output buffer.

+static const int kRatioInputToOutputBuffers =

+ kOpusPreferredBufferDurationMs / kMediaStreamTrackBufferDurationMs;

+// The amount of Frames in a 60 ms buffer @ 48000 samples/second.

+static const int kOpusPreferredFramesPerBuffer =

miu 2016/01/29 02:37:15 nit: No need to use the 'static' keyword inside th

mcasas 2016/01/29 20:37:34 Done.

+ kOpusPreferredSamplingRate * kOpusPreferredBufferDurationMs /

+ base::Time::kMillisecondsPerSecond;

+// Maximum amount of buffers that can be held in the AudioFifo of AudioEncoder.

+// Recording is not real time, hence a certain buffering is allowed.

+static const size_t kMaxNumberOfFifoBuffers = 2 * kRatioInputToOutputBuffers;

+bool DoEncode(OpusEncoder* opus_encoder,

+ float* data_in,

+ int num_samples,

+ std::string* data_out) {

+ DCHECK_EQ(kOpusPreferredFramesPerBuffer, num_samples);

+ data_out->resize(kOpusMaxDataBytes);

+ const opus_int32 result = opus_encode_float(

+ opus_encoder, data_in, num_samples,

+ reinterpret_cast<uint8_t*>(string_as_array(data_out)), kOpusMaxDataBytes);

+ if (result > 1) {

+ // TODO(ajose): Investigate improving this. http://crbug.com/547918

+ data_out->resize(result);

+ return true;

+ }

+ // If |result| in {0,1}, do nothing; the documentation says that a return

+ // value of zero or one means the packet does not need to be transmitted.

+ // Otherwise, we have an error.

+ DLOG_IF(ERROR, result < 0) << " encode failed: " << opus_strerror(result);

+ return false;

+// Interleaves |audio_bus| channels() of floats into a single output linear

+// |buffer|.

+// TODO(mcasas) https://crbug.com/580391 use AudioBus::ToInterleavedFloat().

+void ToInterleaved(media::AudioBus* audio_bus, float* buffer) {

+ for (int ch = 0; ch < audio_bus->channels(); ++ch) {

+ const float* src = audio_bus->channel(ch);

+ const float* const src_end = src + audio_bus->frames();

+ float* dest = buffer + ch;

+ for (; src < src_end; ++src, dest += audio_bus->channels())

+ *dest = *src;

+ }

} // anonymous namespace

-// Nested class encapsulating opus-related encoding details.

-// AudioEncoder is created and destroyed on ATR's main thread (usually the

-// main render thread) but otherwise should operate entirely on

-// |encoder_thread_|, which is owned by AudioTrackRecorder. Be sure to delete

-// |encoder_thread_| before deleting the AudioEncoder using it.

+// Nested class encapsulating opus-related encoding details. It contains an

+// AudioConverter to adapt incoming data to the format Opus likes to have.

+// AudioEncoder is created and destroyed on ATR's main thread (usually the main

+// render thread) but otherwise should operate entirely on |encoder_thread_|,

+// which is owned by AudioTrackRecorder. Be sure to delete |encoder_thread_|

+// before deleting the AudioEncoder using it.

class AudioTrackRecorder::AudioEncoder

- : public base::RefCountedThreadSafe<AudioEncoder> {

+ : public base::RefCountedThreadSafe<AudioEncoder>,

+ public media::AudioConverter::InputCallback {

public:

AudioEncoder(const OnEncodedAudioCB& on_encoded_audio_cb,

int32_t bits_per_second);

@@ -56,17 +119,15 @@ class AudioTrackRecorder::AudioEncoder

private:

friend class base::RefCountedThreadSafe<AudioEncoder>;

- ~AudioEncoder();

+ ~AudioEncoder() override;

bool is_initialized() const { return !!opus_encoder_; }

- void DestroyExistingOpusEncoder();

+ // media::AudioConverted::InputCallback implementation.

+ double ProvideInput(media::AudioBus* audio_bus,

+ base::TimeDelta buffer_delay) override;

- void TransferSamplesIntoBuffer(const media::AudioBus* audio_bus,

- int source_offset,

- int buffer_fill_offset,

- int num_samples);

- bool EncodeFromFilledBuffer(std::string* out);

+ void DestroyExistingOpusEncoder();

const OnEncodedAudioCB on_encoded_audio_cb_;

@@ -75,17 +136,15 @@ class AudioTrackRecorder::AudioEncoder

base::ThreadChecker encoder_thread_checker_;

- // In the case where a call to EncodeAudio() cannot completely fill the

- // buffer, this points to the position at which to populate data in a later

- // call.

- int buffer_fill_end_;

+ // Track Audio (ingress) and Opus encoder input parameters, respectively. They

+ // only differ in their sample_rate() and frames_per_buffer(): output is

+ // 48ksamples/s and 2880, respectively.

+ media::AudioParameters input_params_;

+ media::AudioParameters output_params_;

- int frames_per_buffer_;

- // The duration of one set of frames of encoded audio samples.

- base::TimeDelta buffer_duration_;

- media::AudioParameters audio_params_;

+ // Sampling rate adapter between an OpusEncoder supported and the provided.

+ scoped_ptr<media::AudioConverter> converter_;

+ scoped_ptr<media::AudioFifo> fifo_;

// Buffer for passing AudioBus data to OpusEncoder.

scoped_ptr<float[]> buffer_;

@@ -115,50 +174,56 @@ AudioTrackRecorder::AudioEncoder::~AudioEncoder() {

}

void AudioTrackRecorder::AudioEncoder::OnSetFormat(

- const media::AudioParameters& params) {

+ const media::AudioParameters& input_params) {

+ DVLOG(1) << __FUNCTION__;

DCHECK(encoder_thread_checker_.CalledOnValidThread());

- if (audio_params_.Equals(params))

+ if (input_params_.Equals(input_params))

return;

DestroyExistingOpusEncoder();

- if (!params.IsValid() || params.channels() > 2) {

- DLOG(ERROR) << "Invalid audio params: " << params.AsHumanReadableString();

+ if (!input_params.IsValid()) {

+ DLOG(ERROR) << "Invalid params: " << input_params.AsHumanReadableString();

return;

}

- buffer_duration_ = base::TimeDelta::FromMilliseconds(

- AudioTrackRecorder::GetOpusBufferDuration(params.sample_rate()));

- if (buffer_duration_ == base::TimeDelta()) {

- DLOG(ERROR) << "Could not find a valid |buffer_duration| for the given "

- << "sample rate: " << params.sample_rate();

- return;

- }

- frames_per_buffer_ =

- params.sample_rate() * buffer_duration_.InMilliseconds() / 1000;

- if (frames_per_buffer_ * params.channels() > MAX_SAMPLES_PER_BUFFER) {

- DLOG(ERROR) << "Invalid |frames_per_buffer_|: " << frames_per_buffer_;

- return;

- }

- // Initialize AudioBus buffer for OpusEncoder.

- buffer_fill_end_ = 0;

- buffer_.reset(new float[params.channels() * frames_per_buffer_]);

+ input_params_ = input_params;

+ input_params_.set_frames_per_buffer(input_params_.sample_rate() *

+ kOpusPreferredBufferDurationMs /

+ base::Time::kMillisecondsPerSecond);

+ // third_party/libopus supports up to 2 channels (see implementation of

+ // opus_encoder_create()): force |output_params_| to at most those.

+ output_params_ = media::AudioParameters(

+ media::AudioParameters::AUDIO_PCM_LOW_LATENCY,

+ media::GuessChannelLayout(std::min(input_params_.channels(), 2)),

+ kOpusPreferredSamplingRate,

+ input_params_.bits_per_sample(),

+ kOpusPreferredFramesPerBuffer);

+ DVLOG(1) << "|input_params_|:" << input_params_.AsHumanReadableString()

+ << " -->|output_params_|:" << output_params_.AsHumanReadableString();

+ converter_.reset(new media::AudioConverter(input_params_, output_params_,

+ false /* disable_fifo */));

+ converter_->AddInput(this);

+ converter_->PrimeWithSilence();

+ fifo_.reset(new media::AudioFifo(

+ input_params_.channels(),

+ kMaxNumberOfFifoBuffers * input_params_.frames_per_buffer()));

miu 2016/01/29 02:37:15 I think the second argument to the ctor here is si

mcasas 2016/01/29 20:37:34 I'm leaving the constant kMaxNumberOfFifoBuffers f

+ buffer_.reset(new float[output_params_.channels() *

+ output_params_.frames_per_buffer()]);

// Initialize OpusEncoder.

- DCHECK((params.sample_rate() != 48000) || (params.sample_rate() != 24000) ||

- (params.sample_rate() != 16000) || (params.sample_rate() != 12000) ||

- (params.sample_rate() != 8000))

- << "Opus supports only sample rates of {48, 24, 16, 12, 8}000, requested "

- << params.sample_rate();

int opus_result;

- opus_encoder_ = opus_encoder_create(params.sample_rate(), params.channels(),

- OPUS_APPLICATION_AUDIO, &opus_result);

+ opus_encoder_ = opus_encoder_create(output_params_.sample_rate(),

+ output_params_.channels(),

+ OPUS_APPLICATION_AUDIO,

+ &opus_result);

if (opus_result < 0) {

DLOG(ERROR) << "Couldn't init opus encoder: " << opus_strerror(opus_result)

- << ", sample rate: " << params.sample_rate()

- << ", channels: " << params.channels();

+ << ", sample rate: " << output_params_.sample_rate()

+ << ", channels: " << output_params_.channels();

return;

}

@@ -172,48 +237,61 @@ void AudioTrackRecorder::AudioEncoder::OnSetFormat(

DLOG(ERROR) << "Failed to set opus bitrate: " << bitrate;

return;

}

- audio_params_ = params;

}

void AudioTrackRecorder::AudioEncoder::EncodeAudio(

- scoped_ptr<media::AudioBus> audio_bus,

+ scoped_ptr<media::AudioBus> input_bus,

const base::TimeTicks& capture_time) {

+ DVLOG(1) << __FUNCTION__ << ", #frames " << input_bus->frames();

DCHECK(encoder_thread_checker_.CalledOnValidThread());

- DCHECK_EQ(audio_bus->channels(), audio_params_.channels());

+ DCHECK_EQ(input_bus->channels(), input_params_.channels());

+ DCHECK_EQ(input_bus->frames(), input_params_.sample_rate() *

miu 2016/01/29 02:37:15 You don't need this DCHECK(). AudioFifo::Push() w

mcasas 2016/01/29 20:37:34 Done.

+ kMediaStreamTrackBufferDurationMs /

+ base::Time::kMillisecondsPerSecond);

+ DCHECK(!capture_time.is_null());

+ DCHECK(converter_);

if (!is_initialized())

return;

- base::TimeDelta buffer_fill_duration =

- buffer_fill_end_ * buffer_duration_ / frames_per_buffer_;

- base::TimeTicks buffer_capture_time = capture_time - buffer_fill_duration;

- // Encode all audio in |audio_bus| into zero or more packets.

- int src_pos = 0;

- while (src_pos < audio_bus->frames()) {

- const int num_samples_to_xfer = std::min(

- frames_per_buffer_ - buffer_fill_end_, audio_bus->frames() - src_pos);

- TransferSamplesIntoBuffer(audio_bus.get(), src_pos, buffer_fill_end_,

- num_samples_to_xfer);

- src_pos += num_samples_to_xfer;

- buffer_fill_end_ += num_samples_to_xfer;

- if (buffer_fill_end_ < frames_per_buffer_)

- break;

+ // TODO(mcasas): Consider using a std::deque<scoped_ptr<AudioBus>> instead of

+ // an AudioFifo, to avoid copying data needlessly since we know the sizes of

+ // both input and output and they are multiples.

+ fifo_->Push(input_bus.get());

+ // Wait to have enough |input_bus|s queued up to guarantee a satisfactory

+ // conversion. Luckily here there is an integerkRatioInputToOutputBuffers:1

miu 2016/01/29 02:37:15 Please delete the second sentence of this comment

mcasas 2016/01/29 20:37:34 Done.

+ // ratio, possible since all buffers are multiples of 10ms.

+ while (fifo_->frames() >= input_params_.frames_per_buffer()) {

+ scoped_ptr<media::AudioBus> audio_bus = media::AudioBus::Create(

+ output_params_.channels(), kOpusPreferredFramesPerBuffer);

+ converter_->Convert(audio_bus.get());

+ ToInterleaved(audio_bus.release(), buffer_.get());

scoped_ptr<std::string> encoded_data(new std::string());

- if (EncodeFromFilledBuffer(encoded_data.get())) {

- on_encoded_audio_cb_.Run(audio_params_, std::move(encoded_data),

- buffer_capture_time);

+ if (DoEncode(opus_encoder_, buffer_.get(), kOpusPreferredFramesPerBuffer,

+ encoded_data.get())) {

+ const base::TimeTicks capture_time_of_first_sample =

+ capture_time -

+ base::TimeDelta::FromMicroseconds(fifo_->frames() *

+ base::Time::kMicrosecondsPerSecond /

+ input_params_.sample_rate());

+ on_encoded_audio_cb_.Run(output_params_, std::move(encoded_data),

+ capture_time_of_first_sample);

}

- // Reset the capture timestamp and internal buffer for next set of frames.

- buffer_capture_time += buffer_duration_;

- buffer_fill_end_ = 0;

}

+double AudioTrackRecorder::AudioEncoder::ProvideInput(

+ media::AudioBus* audio_bus,

+ base::TimeDelta buffer_delay) {

+ if (fifo_->frames() >= audio_bus->frames())

miu 2016/01/29 02:37:15 AudioFifo will CHECK that there are sufficient fra

mcasas 2016/01/29 20:37:34 Done.

+ fifo_->Consume(audio_bus, 0, audio_bus->frames());

+ else

+ audio_bus->Zero();

+ // Return volume greater than zero to indicate we have more data.

+ return 1.0;

void AudioTrackRecorder::AudioEncoder::DestroyExistingOpusEncoder() {

// We don't DCHECK that we're on the encoder thread here, as this could be

// called from the dtor (main thread) or from OnSetForamt() (render thread);

@@ -223,48 +301,6 @@ void AudioTrackRecorder::AudioEncoder::DestroyExistingOpusEncoder() {

}

-void AudioTrackRecorder::AudioEncoder::TransferSamplesIntoBuffer(

- const media::AudioBus* audio_bus,

- int source_offset,

- int buffer_fill_offset,

- int num_samples) {

- // TODO(ajose): Consider replacing with AudioBus::ToInterleaved().

- // http://crbug.com/547918

- DCHECK(encoder_thread_checker_.CalledOnValidThread());

- DCHECK(is_initialized());

- // Opus requires channel-interleaved samples in a single array.

- for (int ch = 0; ch < audio_bus->channels(); ++ch) {

- const float* src = audio_bus->channel(ch) + source_offset;

- const float* const src_end = src + num_samples;

- float* dest =

- buffer_.get() + buffer_fill_offset * audio_params_.channels() + ch;

- for (; src < src_end; ++src, dest += audio_params_.channels())

- *dest = *src;

- }

-bool AudioTrackRecorder::AudioEncoder::EncodeFromFilledBuffer(

- std::string* out) {

- DCHECK(encoder_thread_checker_.CalledOnValidThread());

- DCHECK(is_initialized());

- out->resize(OPUS_MAX_PAYLOAD_SIZE);

- const opus_int32 result = opus_encode_float(

- opus_encoder_, buffer_.get(), frames_per_buffer_,

- reinterpret_cast<uint8_t*>(string_as_array(out)), OPUS_MAX_PAYLOAD_SIZE);

- if (result > 1) {

- // TODO(ajose): Investigate improving this. http://crbug.com/547918

- out->resize(result);

- return true;

- }

- // If |result| in {0,1}, do nothing; the documentation says that a return

- // value of zero or one means the packet does not need to be transmitted.

- // Otherwise, we have an error.

- DLOG_IF(ERROR, result < 0) << __FUNCTION__

- << " failed: " << opus_strerror(result);

- return false;

AudioTrackRecorder::AudioTrackRecorder(

const blink::WebMediaStreamTrack& track,

const OnEncodedAudioCB& on_encoded_audio_cb,

@@ -316,22 +352,4 @@ void AudioTrackRecorder::OnData(const media::AudioBus& audio_bus,

base::Passed(&audio_data), capture_time));

}

-int AudioTrackRecorder::GetOpusBufferDuration(int sample_rate) {

- // Valid buffer durations in millseconds. Note there are other valid

- // durations for Opus, see https://tools.ietf.org/html/rfc6716#section-2.1.4

- // Descending order as longer durations can increase compression performance.

- const std::vector<int> opus_valid_buffer_durations_ms = {60, 40, 20, 10};

- // Search for a duration such that |sample_rate| % |buffers_per_second| == 0,

- // where |buffers_per_second| = 1000ms / |possible_duration|.

- for (auto possible_duration : opus_valid_buffer_durations_ms) {

- if (sample_rate * possible_duration % 1000 == 0) {

- return possible_duration;

- }

- // Otherwise, couldn't find a good duration.

- return 0;

} // namespace content