| Index: content/renderer/media/speech_recognition_audio_sink_unittest.cc
|
| diff --git a/content/renderer/media/speech_recognition_audio_sink_unittest.cc b/content/renderer/media/speech_recognition_audio_sink_unittest.cc
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..387d3ea895ab809cb16c3d569a9b638ddaab856e
|
| --- /dev/null
|
| +++ b/content/renderer/media/speech_recognition_audio_sink_unittest.cc
|
| @@ -0,0 +1,466 @@
|
| +// Copyright 2014 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#include "content/renderer/media/speech_recognition_audio_sink.h"
|
| +
|
| +#include "base/strings/utf_string_conversions.h"
|
| +#include "content/renderer/media/mock_media_constraint_factory.h"
|
| +#include "content/renderer/media/webrtc/webrtc_local_audio_track_adapter.h"
|
| +#include "content/renderer/media/webrtc_local_audio_track.h"
|
| +#include "media/audio/audio_parameters.h"
|
| +#include "media/base/audio_bus.h"
|
| +#include "testing/gmock/include/gmock/gmock.h"
|
| +#include "testing/gtest/include/gtest/gtest.h"
|
| +#include "third_party/WebKit/public/platform/WebMediaStreamTrack.h"
|
| +
|
| +namespace {
|
| +
|
| +// Supported speech recognition audio parameters.
|
| +const int kSpeechRecognitionSampleRate = 16000;
|
| +const int kSpeechRecognitionFramesPerBuffer = 1600;
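|
| +// At 16000 Hz, 1600 frames per buffer corresponds to 100 ms of audio.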
|
| +
|
| +// Input audio format.
|
| +const media::AudioParameters::Format kInputFormat =
|
| + media::AudioParameters::AUDIO_PCM_LOW_LATENCY;
|
| +const media::ChannelLayout kInputChannelLayout = media::CHANNEL_LAYOUT_MONO;
|
| +const int kInputChannels = 1;
|
| +const int kInputBitsPerSample = 16;
|
| +
|
| +// Output audio format.
|
| +const media::AudioParameters::Format kOutputFormat =
|
| + media::AudioParameters::AUDIO_PCM_LOW_LATENCY;
|
| +const media::ChannelLayout kOutputChannelLayout = media::CHANNEL_LAYOUT_STEREO;
|
| +const int kOutputChannels = 2;
|
| +const int kOutputBitsPerSample = 16;
|
| +
|
| +// Mocked-out sockets used for Send/Receive.
|
| +// Data is written to and read from a shared buffer used as a FIFO, and there
|
| +// is no blocking. |OnSendCB| is used to trigger a |Receive| on the other
|
| +// socket.
|
| +class MockSyncSocket : public base::SyncSocket {
|
| + public:
|
| + // This allows for two requests in the queue between the |MockSyncSocket|s.
|
| + static const int kSharedBufferSize = 8;
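|
| + // Each request is a single uint32 buffer index (4 bytes), so the buffer
|
| + // holds at most two pending indices between receives.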
|
| +
|
| + // Buffer to be shared between two |MockSyncSocket|s. Allocated on the heap.
|
| + struct SharedBuffer {
|
| + SharedBuffer() : data(), start(0), length(0) {}
|
| +
|
| + uint8 data[kSharedBufferSize];
|
| + size_t start;
|
| + size_t length;
|
| + };
|
| +
|
| + // Callback used for pairing an A.Send() with B.Receive() without blocking.
|
| + typedef base::Callback<void()> OnSendCB;
|
| +
|
| + explicit MockSyncSocket(SharedBuffer* shared_buffer)
|
| + : buffer_(shared_buffer),
|
| + in_failure_mode_(false) {}
|
| +
|
| + MockSyncSocket(SharedBuffer* shared_buffer, const OnSendCB& on_send_cb)
|
| + : buffer_(shared_buffer),
|
| + on_send_cb_(on_send_cb),
|
| + in_failure_mode_(false) {}
|
| +
|
| + virtual size_t Send(const void* buffer, size_t length) OVERRIDE;
|
| + virtual size_t Receive(void* buffer, size_t length) OVERRIDE;
|
| +
|
| + // When |in_failure_mode_| == true, the socket fails to send.
|
| + void SetFailureMode(bool in_failure_mode) {
|
| + in_failure_mode_ = in_failure_mode;
|
| + }
|
| +
|
| + private:
|
| + SharedBuffer* buffer_;
|
| + const OnSendCB on_send_cb_;
|
| + bool in_failure_mode_;
|
| +};
|
| +
|
| +size_t MockSyncSocket::Send(const void* buffer, size_t length) {
|
| + if (in_failure_mode_)
|
| + return 0;
|
| +
|
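| + // Append to the FIFO. Wraparound is not handled: Receive() resets the
|
| + // indices, and at most two 4-byte indices are queued at a time.
|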
| + const uint8* b = static_cast<const uint8*>(buffer);
|
| + for (size_t i = 0; i < length; ++i, ++buffer_->length)
|
| + buffer_->data[buffer_->start + buffer_->length] = b[i];
|
| +
|
| + on_send_cb_.Run();
|
| + return length;
|
| +}
|
| +
|
| +size_t MockSyncSocket::Receive(void* buffer, size_t length) {
|
| + uint8* b = static_cast<uint8*>(buffer);
|
| + // Copy at most |length| bytes out of the FIFO into the destination buffer.
|
| + for (size_t i = 0; i < length && i < buffer_->length; ++i, ++buffer_->start)
|
| + b[i] = buffer_->data[buffer_->start];
|
| +
|
| + // Since the buffer is used sequentially, we can reset the indices here.
|
| + buffer_->start = buffer_->length = 0;
|
| + return length;
|
| +}
|
| +
|
| +// This fake class is the consumer used to verify the behavior of the producer.
|
| +// The |Initialize()| method shows what the consumer should be responsible for
|
| +// in the production code (minus the mocks).
|
| +class FakeSpeechRecognizer {
|
| + public:
|
| + FakeSpeechRecognizer() : is_responsive_(true) { }
|
| +
|
| + void Initialize(
|
| + const blink::WebMediaStreamTrack& track,
|
| + const media::AudioParameters& sink_params,
|
| + base::SharedMemoryHandle* foreign_memory_handle) {
|
| + // Shared memory is allocated, mapped and shared.
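|
| + // The region holds the synchronization parameters followed by the audio
|
| + // data, mirroring the |media::AudioInputBuffer| layout used below.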
|
| + uint32 shared_memory_size =
|
| + sizeof(media::AudioInputBufferParameters) +
|
| + media::AudioBus::CalculateMemorySize(sink_params);
|
| + shared_memory_.reset(new base::SharedMemory());
|
| + ASSERT_TRUE(shared_memory_->CreateAndMapAnonymous(shared_memory_size));
|
| + ASSERT_TRUE(shared_memory_->ShareToProcess(base::GetCurrentProcessHandle(),
|
| + foreign_memory_handle));
|
| +
|
| + // Wrap the shared memory for the audio bus.
|
| + media::AudioInputBuffer* buffer =
|
| + static_cast<media::AudioInputBuffer*>(shared_memory_->memory());
|
| + audio_track_bus_ = media::AudioBus::WrapMemory(sink_params, buffer->audio);
|
| +
|
| + // Reference to the counter used to synchronize.
|
| + buffer_index_ = &(buffer->params.size);
|
| + *buffer_index_ = 0U;
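|
| + // Expected protocol: the producer fills the shared audio bus and sends the
|
| + // current index over the socket; the consumer receives it and increments
|
| + // the index to acknowledge consumption.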
|
| +
|
| + // Create a shared buffer for the |MockSyncSocket|s.
|
| + shared_buffer_.reset(new MockSyncSocket::SharedBuffer());
|
| +
|
| + // Local socket will receive signals from the producer.
|
| + local_socket_.reset(new MockSyncSocket(shared_buffer_.get()));
|
| +
|
| + // We automatically trigger a Receive when data is sent over the socket.
|
| + foreign_socket_ = new MockSyncSocket(
|
| + shared_buffer_.get(),
|
| + base::Bind(&FakeSpeechRecognizer::EmulateReceiveThreadLoopIteration,
|
| + base::Unretained(this)));
|
| +
|
| + // Pairing the sockets this way is normally required; with the mocks it has
|
| + // no effect.
|
| + base::SyncSocket::CreatePair(local_socket_.get(), foreign_socket_);
|
| + }
|
| +
|
| + // Emulates a single iteration of a thread receiving on the socket.
|
| + // This would normally run as a task on a receiving thread in the browser.
|
| + void EmulateReceiveThreadLoopIteration() {
|
| + // When not responsive, do nothing, as if the process were busy.
|
| + if (!is_responsive_)
|
| + return;
|
| +
|
| + local_socket_->Receive(buffer_index_, sizeof(*buffer_index_));
|
| + // Notify the producer that the audio buffer has been consumed.
|
| + ++(*buffer_index_);
|
| + }
|
| +
|
| + // Used to simulate unresponsive behavior of the consumer.
|
| + void SimulateResponsiveness(bool is_responsive) {
|
| + is_responsive_ = is_responsive;
|
| + }
|
| +
|
| + MockSyncSocket* foreign_socket() { return foreign_socket_; }
|
| + media::AudioBus* audio_bus() const { return audio_track_bus_.get(); }
|
| + uint32 buffer_index() { return *buffer_index_; }
|
| +
|
| + private:
|
| + bool is_responsive_;
|
| +
|
| + // Shared memory for the audio and synchronization.
|
| + scoped_ptr<base::SharedMemory> shared_memory_;
|
| +
|
| + // Fake sockets and their shared buffer.
|
| + scoped_ptr<MockSyncSocket::SharedBuffer> shared_buffer_;
|
| + scoped_ptr<MockSyncSocket> local_socket_;
|
| + MockSyncSocket* foreign_socket_;
|
| +
|
| + // Audio bus wrapping the shared memory from the renderer.
|
| + scoped_ptr<media::AudioBus> audio_track_bus_;
|
| +
|
| + // Used for synchronization of sent/received buffers.
|
| + uint32* buffer_index_;
|
| +};
|
| +
|
| +} // namespace
|
| +
|
| +namespace content {
|
| +
|
| +class SpeechRecognitionAudioSinkTest : public testing::Test {
|
| + public:
|
| + SpeechRecognitionAudioSinkTest() { }
|
| +
|
| + // Initializes the producer and consumer with the specified audio parameters.
|
| + // Returns the minimum number of input audio buffers that must be captured
|
| + // before a full output buffer is sent to the consumer.
|
| + uint32 Initialize(int input_sample_rate,
|
| + int input_frames_per_buffer,
|
| + int output_sample_rate,
|
| + int output_frames_per_buffer) {
|
| + // Audio environment setup.
|
| + source_params_.Reset(kInputFormat,
|
| + kInputChannelLayout,
|
| + kInputChannels,
|
| + input_sample_rate,
|
| + kInputBitsPerSample,
|
| + input_frames_per_buffer);
|
| + sink_params_.Reset(kOutputFormat,
|
| + kOutputChannelLayout,
|
| + kOutputChannels,
|
| + output_sample_rate,
|
| + kOutputBitsPerSample,
|
| + output_frames_per_buffer);
|
| + source_data_.reset(new int16[input_frames_per_buffer * kInputChannels]);
|
| +
|
| + // Prepare the track and audio source.
|
| + blink::WebMediaStreamTrack blink_track;
|
| + PrepareBlinkTrackOfType(MEDIA_DEVICE_AUDIO_CAPTURE, &blink_track);
|
| +
|
| + // Get the native track from the blink track and initialize.
|
| + native_track_ =
|
| + static_cast<WebRtcLocalAudioTrack*>(blink_track.extraData());
|
| + native_track_->OnSetFormat(source_params_);
|
| +
|
| + // Create and initialize the consumer.
|
| + recognizer_.reset(new FakeSpeechRecognizer());
|
| + base::SharedMemoryHandle foreign_memory_handle;
|
| + recognizer_->Initialize(blink_track, sink_params_, &foreign_memory_handle);
|
| +
|
| + // Create the producer.
|
| + scoped_ptr<base::SyncSocket> foreign_socket(recognizer_->foreign_socket());
|
| + speech_audio_sink_.reset(new SpeechRecognitionAudioSink(
|
| + blink_track, sink_params_, foreign_memory_handle,
|
| + foreign_socket.Pass(),
|
| + base::Bind(&SpeechRecognitionAudioSinkTest::StoppedCallback,
|
| + base::Unretained(this))));
|
| +
|
| + // Return the number of input buffers needed to fill one output buffer and
|
| + // thus trigger resampling and consumption.
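|
| + // For example, Initialize(44100, 441, 16000, 1600) yields
|
| + // ceil((1600 * 44100) / (441 * 16000)) = 10 buffers per notification.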
|
| + return static_cast<uint32>(std::ceil(
|
| + static_cast<double>(output_frames_per_buffer * input_sample_rate) /
|
| + (input_frames_per_buffer * output_sample_rate)));
|
| + }
|
| +
|
| + // Mock callback expected to be called when the track is stopped.
|
| + MOCK_METHOD0(StoppedCallback, void());
|
| +
|
| + protected:
|
| + // Prepares a blink track of a given MediaStreamType and attaches the native
|
| + // track, which can be used to capture audio data and pass it to the producer.
|
| + static void PrepareBlinkTrackOfType(
|
| + const MediaStreamType device_type,
|
| + blink::WebMediaStreamTrack* blink_track) {
|
| + StreamDeviceInfo device_info(device_type, "Mock device",
|
| + "mock_device_id");
|
| + MockMediaConstraintFactory constraint_factory;
|
| + const blink::WebMediaConstraints constraints =
|
| + constraint_factory.CreateWebMediaConstraints();
|
| + scoped_refptr<WebRtcAudioCapturer> capturer(
|
| + WebRtcAudioCapturer::CreateCapturer(-1, device_info, constraints, NULL,
|
| + NULL));
|
| + scoped_refptr<WebRtcLocalAudioTrackAdapter> adapter(
|
| + WebRtcLocalAudioTrackAdapter::Create(std::string(), NULL));
|
| + scoped_ptr<WebRtcLocalAudioTrack> native_track(
|
| + new WebRtcLocalAudioTrack(adapter.get(), capturer, NULL));
|
| + blink::WebMediaStreamSource blink_audio_source;
|
| + blink_audio_source.initialize(base::UTF8ToUTF16("dummy_source_id"),
|
| + blink::WebMediaStreamSource::TypeAudio,
|
| + base::UTF8ToUTF16("dummy_source_name"));
|
| + MediaStreamSource::SourceStoppedCallback cb;
|
| + blink_audio_source.setExtraData(
|
| + new MediaStreamAudioSource(-1, device_info, cb, NULL));
|
| + blink_track->initialize(blink::WebString::fromUTF8("dummy_track"),
|
| + blink_audio_source);
|
| + blink_track->setExtraData(native_track.release());
|
| + }
|
| +
|
| + // Emulates an audio capture device capturing data from the source.
|
| + inline void CaptureAudio(const uint32 buffers) {
|
| + for (uint32 i = 0; i < buffers; ++i)
|
| + native_track_->Capture(source_data_.get(),
|
| + base::TimeDelta::FromMilliseconds(0), 1, false,
|
| + false);
|
| + }
|
| +
|
| + // Used to simulate a problem with sockets.
|
| + void SetFailureModeOnForeignSocket(bool in_failure_mode) {
|
| + recognizer_->foreign_socket()->SetFailureMode(in_failure_mode);
|
| + }
|
| +
|
| + // Helper method for verifying captured audio data has been consumed.
|
| + inline void AssertConsumedBuffers(const uint32 buffer_index) {
|
| + ASSERT_EQ(buffer_index, recognizer_->buffer_index());
|
| + }
|
| +
|
| + // Helper method for providing audio data to producer and verifying it was
|
| + // consumed on the recognizer.
|
| + inline void CaptureAudioAndAssertConsumedBuffers(const uint32 buffers,
|
| + const uint32 buffer_index) {
|
| + CaptureAudio(buffers);
|
| + AssertConsumedBuffers(buffer_index);
|
| + }
|
| +
|
| + // Helper method to capture and assert consumption at different sample rates
|
| + // and audio buffer sizes.
|
| + inline void AssertConsumptionForAudioParameters(
|
| + const int input_sample_rate,
|
| + const int input_frames_per_buffer,
|
| + const int output_sample_rate,
|
| + const int output_frames_per_buffer,
|
| + const uint32 consumptions) {
|
| + const uint32 kBuffersPerNotification =
|
| + Initialize(input_sample_rate, input_frames_per_buffer,
|
| + output_sample_rate, output_frames_per_buffer);
|
| + AssertConsumedBuffers(0U);
|
| +
|
| + for (uint32 i = 1U; i <= consumptions; ++i) {
|
| + CaptureAudio(kBuffersPerNotification);
|
| + ASSERT_EQ(i, recognizer_->buffer_index())
|
| + << "Tested at rates: "
|
| + << "In(" << input_sample_rate << ", " << input_frames_per_buffer
|
| + << ") "
|
| + << "Out(" << output_sample_rate << ", " << output_frames_per_buffer
|
| + << ")";
|
| + }
|
| + }
|
| +
|
| + // Producer.
|
| + scoped_ptr<SpeechRecognitionAudioSink> speech_audio_sink_;
|
| +
|
| + // Consumer.
|
| + scoped_ptr<FakeSpeechRecognizer> recognizer_;
|
| +
|
| + // Audio related members.
|
| + scoped_ptr<int16[]> source_data_;
|
| + media::AudioParameters source_params_;
|
| + media::AudioParameters sink_params_;
|
| + WebRtcLocalAudioTrack* native_track_;
|
| +};
|
| +
|
| +// Not all types of tracks are supported. This test checks that the support
|
| +// policy is implemented correctly.
|
| +TEST_F(SpeechRecognitionAudioSinkTest, CheckIsSupportedAudioTrack) {
|
| + typedef std::map<MediaStreamType, bool> SupportedTrackPolicy;
|
| +
|
| + // This test must be aligned with the policy of supported tracks.
|
| + SupportedTrackPolicy p;
|
| + p[MEDIA_NO_SERVICE] = false;
|
| + p[MEDIA_DEVICE_AUDIO_CAPTURE] = true; // The only one supported for now.
|
| + p[MEDIA_DEVICE_VIDEO_CAPTURE] = false;
|
| + p[MEDIA_TAB_AUDIO_CAPTURE] = false;
|
| + p[MEDIA_TAB_VIDEO_CAPTURE] = false;
|
| + p[MEDIA_DESKTOP_VIDEO_CAPTURE] = false;
|
| + p[MEDIA_LOOPBACK_AUDIO_CAPTURE] = false;
|
| + p[MEDIA_DEVICE_AUDIO_OUTPUT] = false;
|
| +
|
| + // Ensure this test gets updated along with the |content::MediaStreamType| enum.
|
| + EXPECT_EQ(static_cast<size_t>(NUM_MEDIA_TYPES), p.size());
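|
| + // The map holds one entry per type, so adding an enum value without
|
| + // updating the policy above makes this size check fail.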
|
| +
|
| + // Check the entire policy.
|
| + for (SupportedTrackPolicy::iterator it = p.begin(); it != p.end(); ++it) {
|
| + blink::WebMediaStreamTrack blink_track;
|
| + PrepareBlinkTrackOfType(it->first, &blink_track);
|
| + ASSERT_EQ(
|
| + it->second,
|
| + SpeechRecognitionAudioSink::IsSupportedTrack(blink_track));
|
| + }
|
| +}
|
| +
|
| +// Checks if the producer can support the listed range of input sample rates
|
| +// and associated buffer sizes.
|
| +TEST_F(SpeechRecognitionAudioSinkTest, RecognizerNotifiedOnSocket) {
|
| + const size_t kNumAudioParamTuples = 24;
|
| + const int kAudioParams[kNumAudioParamTuples][2] = {
|
| + {8000, 80}, {8000, 800}, {16000, 160}, {16000, 1600},
|
| + {24000, 240}, {24000, 2400}, {32000, 320}, {32000, 3200},
|
| + {44100, 441}, {44100, 4410}, {48000, 480}, {48000, 4800},
|
| + {96000, 960}, {96000, 9600}, {11025, 111}, {11025, 1103},
|
| + {22050, 221}, {22050, 2205}, {88200, 882}, {88200, 8820},
|
| + {176400, 1764}, {176400, 17640}, {192000, 1920}, {192000, 19200}};
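|
| + // Each {sample rate, frames per buffer} tuple above is roughly 10 ms or
|
| + // 100 ms of audio at that rate (e.g. {44100, 441} is a 10 ms buffer).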
|
| +
|
| + // Check all listed tuples of input sample rates and buffer sizes.
|
| + for (size_t i = 0; i < kNumAudioParamTuples; ++i) {
|
| + AssertConsumptionForAudioParameters(
|
| + kAudioParams[i][0], kAudioParams[i][1],
|
| + kSpeechRecognitionSampleRate, kSpeechRecognitionFramesPerBuffer, 3U);
|
| + }
|
| +}
|
| +
|
| +// Checks that the input data is getting resampled to the target sample rate.
|
| +TEST_F(SpeechRecognitionAudioSinkTest, AudioDataIsResampledOnSink) {
|
| + EXPECT_GE(kInputChannels, 1);
|
| + EXPECT_GE(kOutputChannels, 1);
|
| +
|
| + // Input audio is sampled at 44.1 kHz in 10 ms chunks (441 frames). The
|
| + // desired output corresponds to the speech recognition engine requirements:
|
| + // 16 kHz with 100 ms chunks (1600 frames per buffer).
|
| + const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
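|
| + // With these parameters, ten 10 ms input buffers are captured for each
|
| + // 100 ms output buffer (see the formula in Initialize()).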
|
| +
|
| + // Fill each input frame (all of its channels) with its index: 0, 1, ..., 440.
|
| + // Note: iterate over frames, not samples, so the indexing below stays in
|
| + // bounds even if |kInputChannels| were greater than 1.
|
| + const uint32 kSourceFrames = 441;
|
| + for (uint32 i = 0; i < kSourceFrames; ++i) {
|
| + for (int c = 0; c < kInputChannels; ++c)
|
| + source_data_[i * kInputChannels + c] = i;
|
| + }
|
| +
|
| + // Prepare sink audio bus and data for rendering.
|
| + media::AudioBus* sink_bus = recognizer_->audio_bus();
|
| + const uint32 kSinkDataLength = 1600 * kOutputChannels;
|
| + int16 sink_data[kSinkDataLength] = {0};
|
| +
|
| + // Render the audio data from the recognizer.
|
| + sink_bus->ToInterleaved(sink_bus->frames(),
|
| + sink_params_.bits_per_sample() / 8, sink_data);
|
| +
|
| + // Checking only a fraction of the sink frames.
|
| + const uint32 kNumFramesToTest = 12;
|
| +
|
| + // Check that all channels are zeroed out before we trigger resampling.
|
| + for (uint32 i = 0; i < kNumFramesToTest; ++i) {
|
| + for (int c = 0; c < kOutputChannels; ++c)
|
| + EXPECT_EQ(0, sink_data[i * kOutputChannels + c]);
|
| + }
|
| +
|
| + // Trigger the speech sink to resample the input data.
|
| + AssertConsumedBuffers(0U);
|
| + CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
|
| +
|
| + // Render the audio data from the recognizer.
|
| + sink_bus->ToInterleaved(sink_bus->frames(),
|
| + sink_params_.bits_per_sample() / 8, sink_data);
|
| +
|
| + // Expected frames of resampled data, derived from the ramp in |source_data_|.
|
| + const int16 kExpectedData[kNumFramesToTest] = {0, 2, 5, 8, 11, 13,
|
| + 16, 19, 22, 24, 27, 30};
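|
| + // Output frame n maps to input position n * (44100 / 16000) = n * 2.75625,
|
| + // so with the ramp input each resampled value lands near
|
| + // floor(n * 2.75625); the exact values depend on the resampler's filter.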
|
| +
|
| + // Check all channels have the same resampled data.
|
| + for (uint32 i = 0; i < kNumFramesToTest; ++i) {
|
| + for (int c = 0; c < kOutputChannels; ++c)
|
| + EXPECT_EQ(kExpectedData[i], sink_data[i * kOutputChannels + c]);
|
| + }
|
| +}
|
| +
|
| +// Checks that the producer does not misbehave when a socket failure occurs.
|
| +TEST_F(SpeechRecognitionAudioSinkTest, SyncSocketFailsSendingData) {
|
| + const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
|
| + // Start with no problems on the socket.
|
| + AssertConsumedBuffers(0U);
|
| + CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
|
| +
|
| + // A failure occurs (socket cannot send).
|
| + SetFailureModeOnForeignSocket(true);
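|
| + // Send() now returns 0, so no notification reaches the consumer and the
|
| + // buffer index must remain at 1.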
|
| + CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
|
| +}
|
| +
|
| +// Checks that the stopped callback is issued when the track is stopped.
|
| +TEST_F(SpeechRecognitionAudioSinkTest, OnReadyStateChangedOccurred) {
|
| + const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
|
| + AssertConsumedBuffers(0U);
|
| + CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
|
| + EXPECT_CALL(*this, StoppedCallback()).Times(1);
|
| +
|
| + native_track_->Stop();
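|
| + // After the track stops, further captured audio must not be consumed, so
|
| + // the buffer index stays at 1.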
|
| + CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
|
| +}
|
| +
|
| +} // namespace content
|
|
|