content/renderer/media/speech_recognition_audio_sink_unittest.cc - Issue 499233003: Binding media stream audio track to speech recognition [renderer]

Side by Side Diff: content/renderer/media/speech_recognition_audio_sink_unittest.cc

Issue 499233003: Binding media stream audio track to speech recognition [renderer] (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Add ENABLE_WEBRTC flag checks Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « content/renderer/media/speech_recognition_audio_sink.cc ('k') | content/renderer/speech_recognition_dispatcher.h » ('j') | content/renderer/speech_recognition_dispatcher.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2014 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "content/renderer/media/speech_recognition_audio_sink.h"

	6

	7 #include "base/bind.h"

	8 #include "base/strings/utf_string_conversions.h"

	9 #include "content/renderer/media/media_stream_audio_source.h"

	10 #include "content/renderer/media/mock_media_constraint_factory.h"

	11 #include "content/renderer/media/webrtc/webrtc_local_audio_track_adapter.h"

	12 #include "content/renderer/media/webrtc_local_audio_track.h"

	13 #include "media/audio/audio_parameters.h"

	14 #include "media/base/audio_bus.h"

	15 #include "testing/gmock/include/gmock/gmock.h"

	16 #include "testing/gtest/include/gtest/gtest.h"

	17 #include "third_party/WebKit/public/platform/WebMediaStreamTrack.h"

	18

	19 namespace {

	20

	21 // Supported speech recognition audio parameters.

	22 const int kSpeechRecognitionSampleRate = 16000;

	23 const int kSpeechRecognitionFramesPerBuffer = 1600;

	24

	25 // Input audio format.

	26 const media::AudioParameters::Format kInputFormat =

	27 media::AudioParameters::AUDIO_PCM_LOW_LATENCY;

	28 const media::ChannelLayout kInputChannelLayout = media::CHANNEL_LAYOUT_MONO;

	29 const int kInputChannels = 1;

	30 const int kInputBitsPerSample = 16;

	31

	32 // Output audio format.

	33 const media::AudioParameters::Format kOutputFormat =

	34 media::AudioParameters::AUDIO_PCM_LOW_LATENCY;

	35 const media::ChannelLayout kOutputChannelLayout = media::CHANNEL_LAYOUT_STEREO;

	36 const int kOutputChannels = 2;

	37 const int kOutputBitsPerSample = 16;

	38

	39 // Mocked out sockets used for Send/Receive.

	40 // Data is written and read from a shared buffer used as a FIFO and there is

	41 // no blocking. \|OnSendCB\| is used to trigger a \|Receive\| on the other socket.

	42 class MockSyncSocket : public base::SyncSocket {

	43 public:

	44 // This allows for 2 requests in queue between the \|MockSyncSocket\|s.

	45 static const int kSharedBufferSize = 8;

	46

	47 // Buffer to be shared between two \|MockSyncSocket\|s. Allocated on heap.

	48 struct SharedBuffer {

	49 SharedBuffer() : data(), start(0), length(0) {}

	50

	51 uint8 data[kSharedBufferSize];

	52 size_t start;

	53 size_t length;

	54 };

	55

	56 // Callback used for pairing an A.Send() with B.Receieve() without blocking.

	57 typedef base::Callback<void()> OnSendCB;

	58

	59 explicit MockSyncSocket(SharedBuffer* shared_buffer)

	60 : buffer_(shared_buffer),

	61 in_failure_mode_(false) {}

	62

	63 MockSyncSocket(SharedBuffer* shared_buffer, const OnSendCB& on_send_cb)

	64 : buffer_(shared_buffer),

	65 on_send_cb_(on_send_cb),

	66 in_failure_mode_(false) {}

	67

	68 virtual size_t Send(const void* buffer, size_t length) override;

	69 virtual size_t Receive(void* buffer, size_t length) override;

	70

	71 // When \|in_failure_mode_\| == true, the socket fails to send.

	72 void SetFailureMode(bool in_failure_mode) {

	73 in_failure_mode_ = in_failure_mode;

	74 }

	75

	76 private:

	77 SharedBuffer* buffer_;

	78 const OnSendCB on_send_cb_;

	79 bool in_failure_mode_;

	80

	81 DISALLOW_COPY_AND_ASSIGN(MockSyncSocket);

	82 };

	83

	84 // base::SyncSocket implementation

	85 size_t MockSyncSocket::Send(const void* buffer, size_t length) {

	86 if (in_failure_mode_)

	87 return 0;

	88

	89 const uint8* b = static_cast<const uint8*>(buffer);

	90 for (size_t i = 0; i < length; ++i, ++buffer_->length)

	91 buffer_->data[buffer_->start + buffer_->length] = b[i];

	92

	93 on_send_cb_.Run();

	94 return length;

	95 }

	96

	97 size_t MockSyncSocket::Receive(void* buffer, size_t length) {

	98 uint8* b = static_cast<uint8*>(buffer);

	99 for (size_t i = buffer_->start; i < buffer_->length; ++i, ++buffer_->start)

	100 b[i] = buffer_->data[buffer_->start];

	101

	102 // Since buffer is used sequentially, we can reset the buffer indices here.

	103 buffer_->start = buffer_->length = 0;

	104 return length;

	105 }

	106

	107 // This fake class is the consumer used to verify behaviour of the producer.

	108 // The \|Initialize()\| method shows what the consumer should be responsible for

	109 // in the production code (minus the mocks).

	110 class FakeSpeechRecognizer {

	111 public:

	112 FakeSpeechRecognizer() : is_responsive_(true) { }

	113

	114 void Initialize(

	115 const blink::WebMediaStreamTrack& track,

	116 const media::AudioParameters& sink_params,

	117 base::SharedMemoryHandle* foreign_memory_handle) {

	118 // Shared memory is allocated, mapped and shared.

	119 uint32 shared_memory_size =

	120 sizeof(media::AudioInputBufferParameters) +

	121 media::AudioBus::CalculateMemorySize(sink_params);

	122 shared_memory_.reset(new base::SharedMemory());

	123 ASSERT_TRUE(shared_memory_->CreateAndMapAnonymous(shared_memory_size));

	124 ASSERT_TRUE(shared_memory_->ShareToProcess(base::GetCurrentProcessHandle(),

	125 foreign_memory_handle));

	126

	127 // Wrap the shared memory for the audio bus.

	128 media::AudioInputBuffer* buffer =

	129 static_cast<media::AudioInputBuffer*>(shared_memory_->memory());

	130 audio_track_bus_ = media::AudioBus::WrapMemory(sink_params, buffer->audio);

	131

	132 // Reference to the counter used to synchronize.

	133 buffer_index_ = &(buffer->params.size);

	134 *buffer_index_ = 0U;

	135

	136 // Create a shared buffer for the \|MockSyncSocket\|s.

	137 shared_buffer_.reset(new MockSyncSocket::SharedBuffer());

	138

	139 // Local socket will receive signals from the producer.

	140 local_socket_.reset(new MockSyncSocket(shared_buffer_.get()));

	141

	142 // We automatically trigger a Receive when data is sent over the socket.

	143 foreign_socket_ = new MockSyncSocket(

	144 shared_buffer_.get(),

	145 base::Bind(&FakeSpeechRecognizer::EmulateReceiveThreadLoopIteration,

	146 base::Unretained(this)));

	147

	148 // This is usually done to pair the sockets. Here it's not effective.

	149 base::SyncSocket::CreatePair(local_socket_.get(), foreign_socket_);

	150 }

	151

	152 // Emulates a single iteraton of a thread receiving on the socket.

	153 // This would normally be done on a receiving thread's task on the browser.

	154 void EmulateReceiveThreadLoopIteration() {

	155 // When not responsive do nothing as if the process is busy.

	156 if (!is_responsive_)

	157 return;

	158

	159 local_socket_->Receive(buffer_index_, sizeof(*buffer_index_));

	160 // Notify the producer that the audio buffer has been consumed.

	161 ++(*buffer_index_);

	162 }

	163

	164 // Used to simulate an unresponsive behaviour of the consumer.

	165 void SimulateResponsiveness(bool is_responsive) {

	166 is_responsive_ = is_responsive;

	167 }

	168

	169 MockSyncSocket* foreign_socket() { return foreign_socket_; }

	170 media::AudioBus* audio_bus() const { return audio_track_bus_.get(); }

	171 uint32 buffer_index() { return *buffer_index_; }

	172

	173 private:

	174 bool is_responsive_;

	175

	176 // Shared memory for the audio and synchronization.

	177 scoped_ptr<base::SharedMemory> shared_memory_;

	178

	179 // Fake sockets and their shared buffer.

	180 scoped_ptr<MockSyncSocket::SharedBuffer> shared_buffer_;

	181 scoped_ptr<MockSyncSocket> local_socket_;

	182 MockSyncSocket* foreign_socket_;

	183

	184 // Audio bus wrapping the shared memory from the renderer.

	185 scoped_ptr<media::AudioBus> audio_track_bus_;

	186

	187 // Used for synchronization of sent/received buffers.

	188 uint32* buffer_index_;

	189

	190 DISALLOW_COPY_AND_ASSIGN(FakeSpeechRecognizer);

	191 };

	192

	193 } // namespace

	194

	195 namespace content {

	196

	197 class SpeechRecognitionAudioSinkTest : public testing::Test {

	198 public:

	199 SpeechRecognitionAudioSinkTest() {}

	200

	201 ~SpeechRecognitionAudioSinkTest() {}

	202

	203 // Initializes the producer and consumer with specified audio parameters.

	204 // Returns the minimal number of input audio buffers which need to be captured

	205 // before they get sent to the consumer.

	206 uint32 Initialize(int input_sample_rate,

	207 int input_frames_per_buffer,

	208 int output_sample_rate,

	209 int output_frames_per_buffer) {

	210 // Audio Environment setup.

	211 source_params_.Reset(kInputFormat,

	212 kInputChannelLayout,

	213 kInputChannels,

	214 input_sample_rate,

	215 kInputBitsPerSample,

	216 input_frames_per_buffer);

	217 sink_params_.Reset(kOutputFormat,

	218 kOutputChannelLayout,

	219 kOutputChannels,

	220 output_sample_rate,

	221 kOutputBitsPerSample,

	222 output_frames_per_buffer);

	223 source_data_.reset(new int16[input_frames_per_buffer * kInputChannels]);

	224

	225 // Prepare the track and audio source.

	226 blink::WebMediaStreamTrack blink_track;

	227 PrepareBlinkTrackOfType(MEDIA_DEVICE_AUDIO_CAPTURE, &blink_track);

	228

	229 // Get the native track from the blink track and initialize.

	230 native_track_ =

	231 static_cast<WebRtcLocalAudioTrack*>(blink_track.extraData());

	232 native_track_->OnSetFormat(source_params_);

	233

	234 // Create and initialize the consumer.

	235 recognizer_.reset(new FakeSpeechRecognizer());

	236 base::SharedMemoryHandle foreign_memory_handle;

	237 recognizer_->Initialize(blink_track, sink_params_, &foreign_memory_handle);

	238

	239 // Create the producer.

	240 scoped_ptr<base::SyncSocket> foreign_socket(recognizer_->foreign_socket());

	241 speech_audio_sink_.reset(new SpeechRecognitionAudioSink(

	242 blink_track, sink_params_, foreign_memory_handle,

	243 foreign_socket.Pass(),

	244 base::Bind(&SpeechRecognitionAudioSinkTest::StoppedCallback,

	245 base::Unretained(this))));

	246

	247 // Return number of buffers needed to trigger resampling and consumption.

	248 return static_cast<uint32>(std::ceil(

	249 static_cast<double>(output_frames_per_buffer * input_sample_rate) /

	250 (input_frames_per_buffer * output_sample_rate)));

	251 }

	252

	253 // Mock callback expected to be called when the track is stopped.

	254 MOCK_METHOD0(StoppedCallback, void());

	255

	256 protected:

	257 // Prepares a blink track of a given MediaStreamType and attaches the native

	258 // track which can be used to capture audio data and pass it to the producer.

	259 static void PrepareBlinkTrackOfType(

	260 const MediaStreamType device_type,

	261 blink::WebMediaStreamTrack* blink_track) {

	262 StreamDeviceInfo device_info(device_type, "Mock device",

	263 "mock_device_id");

	264 MockMediaConstraintFactory constraint_factory;

	265 const blink::WebMediaConstraints constraints =

	266 constraint_factory.CreateWebMediaConstraints();

	267 scoped_refptr<WebRtcAudioCapturer> capturer(

	268 WebRtcAudioCapturer::CreateCapturer(-1, device_info, constraints, NULL,

	269 NULL));

	270 scoped_refptr<WebRtcLocalAudioTrackAdapter> adapter(

	271 WebRtcLocalAudioTrackAdapter::Create(std::string(), NULL));

	272 scoped_ptr<WebRtcLocalAudioTrack> native_track(

	273 new WebRtcLocalAudioTrack(adapter.get(), capturer, NULL));

	274 blink::WebMediaStreamSource blink_audio_source;

	275 blink_audio_source.initialize(base::UTF8ToUTF16("dummy_source_id"),

	276 blink::WebMediaStreamSource::TypeAudio,

	277 base::UTF8ToUTF16("dummy_source_name"));

	278 MediaStreamSource::SourceStoppedCallback cb;

	279 blink_audio_source.setExtraData(

	280 new MediaStreamAudioSource(-1, device_info, cb, NULL));

	281 blink_track->initialize(blink::WebString::fromUTF8("dummy_track"),

	282 blink_audio_source);

	283 blink_track->setExtraData(native_track.release());

	284 }

	285

	286 // Emulates an audio capture device capturing data from the source.

	287 inline void CaptureAudio(const uint32 buffers) {

	288 for (uint32 i = 0; i < buffers; ++i)

	289 native_track()->Capture(source_data(),

	290 base::TimeDelta::FromMilliseconds(0), 1, false,

	291 false);

	292 }

	293

	294 // Used to simulate a problem with sockets.

	295 void SetFailureModeOnForeignSocket(bool in_failure_mode) {

	296 recognizer()->foreign_socket()->SetFailureMode(in_failure_mode);

	297 }

	298

	299 // Helper method for verifying captured audio data has been consumed.

	300 inline void AssertConsumedBuffers(const uint32 buffer_index) {

	301 ASSERT_EQ(buffer_index, recognizer_->buffer_index());

	302 }

	303

	304 // Helper method for providing audio data to producer and verifying it was

	305 // consumed on the recognizer.

	306 inline void CaptureAudioAndAssertConsumedBuffers(const uint32 buffers,

	307 const uint32 buffer_index) {

	308 CaptureAudio(buffers);

	309 AssertConsumedBuffers(buffer_index);

	310 }

	311

	312 // Helper method to capture and assert consumption at different sample rates

	313 // and audio buffer sizes.

	314 inline void AssertConsumptionForAudioParameters(

	315 const int input_sample_rate,

	316 const int input_frames_per_buffer,

	317 const int output_sample_rate,

	318 const int output_frames_per_buffer,

	319 const uint32 consumptions) {

	320 const uint32 kBuffersPerNotification =

	321 Initialize(input_sample_rate, input_frames_per_buffer,

	322 output_sample_rate, output_frames_per_buffer);

	323 AssertConsumedBuffers(0U);

	324

	325 for (uint32 i = 1U; i <= consumptions; ++i) {

	326 CaptureAudio(kBuffersPerNotification);

	327 ASSERT_EQ(i, recognizer_->buffer_index())

	328 << "Tested at rates: "

	329 << "In(" << input_sample_rate << ", " << input_frames_per_buffer

	330 << ") "

	331 << "Out(" << output_sample_rate << ", " << output_frames_per_buffer

	332 << ")";

	333 }

	334 }

	335

	336 int16* source_data() { return source_data_.get(); }

	337

	338 FakeSpeechRecognizer* recognizer() { return recognizer_.get(); }

	339

	340 const media::AudioParameters& sink_params() { return sink_params_; }

	341

	342 WebRtcLocalAudioTrack* native_track() { return native_track_; }

	343

	344 private:

	345 // Producer.

	346 scoped_ptr<SpeechRecognitionAudioSink> speech_audio_sink_;

	347

	348 // Consumer.

	349 scoped_ptr<FakeSpeechRecognizer> recognizer_;

	350

	351 // Audio related members.

	352 scoped_ptr<int16[]> source_data_;

	353 media::AudioParameters source_params_;

	354 media::AudioParameters sink_params_;

	355 WebRtcLocalAudioTrack* native_track_;

	356

	357 DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionAudioSinkTest);

	358 };

	359

	360 // Not all types of tracks are supported. This test checks if that policy is

	361 // implemented correctly.

	362 TEST_F(SpeechRecognitionAudioSinkTest, CheckIsSupportedAudioTrack) {

	363 typedef std::map<MediaStreamType, bool> SupportedTrackPolicy;

	364

	365 // This test must be aligned with the policy of supported tracks.

	366 SupportedTrackPolicy p;

	367 p[MEDIA_NO_SERVICE] = false;

	368 p[MEDIA_DEVICE_AUDIO_CAPTURE] = true; // The only one supported for now.

	369 p[MEDIA_DEVICE_VIDEO_CAPTURE] = false;

	370 p[MEDIA_TAB_AUDIO_CAPTURE] = false;

	371 p[MEDIA_TAB_VIDEO_CAPTURE] = false;

	372 p[MEDIA_DESKTOP_VIDEO_CAPTURE] = false;

	373 p[MEDIA_LOOPBACK_AUDIO_CAPTURE] = false;

	374 p[MEDIA_DEVICE_AUDIO_OUTPUT] = false;

	375

	376 // Ensure this test gets updated along with \|content::MediaStreamType\| enum.

	377 EXPECT_EQ(NUM_MEDIA_TYPES, p.size());

	378

	379 // Check the the entire policy.

	380 for (SupportedTrackPolicy::iterator it = p.begin(); it != p.end(); ++it) {

	381 blink::WebMediaStreamTrack blink_track;

	382 PrepareBlinkTrackOfType(it->first, &blink_track);

	383 ASSERT_EQ(

	384 it->second,

	385 SpeechRecognitionAudioSink::IsSupportedTrack(blink_track));

	386 }

	387 }

	388

	389 // Checks if the producer can support the listed range of input sample rates

	390 // and associated buffer sizes.

	391 TEST_F(SpeechRecognitionAudioSinkTest, RecognizerNotifiedOnSocket) {

	392 const size_t kNumAudioParamTuples = 24;

	393 const int kAudioParams[kNumAudioParamTuples][2] = {

	394 {8000, 80}, {8000, 800}, {16000, 160}, {16000, 1600},

	395 {24000, 240}, {24000, 2400}, {32000, 320}, {32000, 3200},

	396 {44100, 441}, {44100, 4410}, {48000, 480}, {48000, 4800},

	397 {96000, 960}, {96000, 9600}, {11025, 111}, {11025, 1103},

	398 {22050, 221}, {22050, 2205}, {88200, 882}, {88200, 8820},

	399 {176400, 1764}, {176400, 17640}, {192000, 1920}, {192000, 19200}};

	400

	401 // Check all listed tuples of input sample rates and buffers sizes.

	402 for (size_t i = 0; i < kNumAudioParamTuples; ++i) {

	403 AssertConsumptionForAudioParameters(

	404 kAudioParams[i][0], kAudioParams[i][1],

	405 kSpeechRecognitionSampleRate, kSpeechRecognitionFramesPerBuffer, 3U);

	406 }

	407 }

	408

	409 // Checks that the input data is getting resampled to the target sample rate.

	410 TEST_F(SpeechRecognitionAudioSinkTest, AudioDataIsResampledOnSink) {

	411 EXPECT_GE(kInputChannels, 1);

	412 EXPECT_GE(kOutputChannels, 1);

	413

	414 // Input audio is sampled at 44.1 KHz with data chunks of 10ms. Desired output

	415 // is corresponding to the speech recognition engine requirements: 16 KHz with

	416 // 100 ms chunks (1600 frames per buffer).

	417 const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);

	418 // Fill audio input frames with 0, 1, 2, 3, ..., 440.

	419 const uint32 kSourceDataLength = 441 * kInputChannels;

	420 for (uint32 i = 0; i < kSourceDataLength; ++i) {

	421 for (int c = 0; c < kInputChannels; ++c)

	422 source_data()[i * kInputChannels + c] = i;

	423 }

	424

	425 // Prepare sink audio bus and data for rendering.

	426 media::AudioBus* sink_bus = recognizer()->audio_bus();

	427 const uint32 kSinkDataLength = 1600 * kOutputChannels;

	428 int16 sink_data[kSinkDataLength] = {0};

	429

	430 // Render the audio data from the recognizer.

	431 sink_bus->ToInterleaved(sink_bus->frames(),

	432 sink_params().bits_per_sample() / 8, sink_data);

	433

	434 // Checking only a fraction of the sink frames.

	435 const uint32 kNumFramesToTest = 12;

	436

	437 // Check all channels are zeroed out before we trigger resampling.

	438 for (uint32 i = 0; i < kNumFramesToTest; ++i) {

	439 for (int c = 0; c < kOutputChannels; ++c)

	440 EXPECT_EQ(0, sink_data[i * kOutputChannels + c]);

	441 }

	442

	443 // Trigger the speech sink to resample the input data.

	444 AssertConsumedBuffers(0U);

	445 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);

	446

	447 // Render the audio data from the recognizer.

	448 sink_bus->ToInterleaved(sink_bus->frames(),

	449 sink_params().bits_per_sample() / 8, sink_data);

	450

	451 // Resampled data expected frames. Extracted based on \|source_data()\|.

	452 const int16 kExpectedData[kNumFramesToTest] = {0, 2, 5, 8, 11, 13,

	453 16, 19, 22, 24, 27, 30};

	454

	455 // Check all channels have the same resampled data.

	456 for (uint32 i = 0; i < kNumFramesToTest; ++i) {

	457 for (int c = 0; c < kOutputChannels; ++c)

	458 EXPECT_EQ(kExpectedData[i], sink_data[i * kOutputChannels + c]);

	459 }

	460 }

	461

	462 // Checks that the producer does not misbehave when a socket failure occurs.

	463 TEST_F(SpeechRecognitionAudioSinkTest, SyncSocketFailsSendingData) {

	464 const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);

	465 // Start with no problems on the socket.

	466 AssertConsumedBuffers(0U);

	467 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);

	468

	469 // A failure occurs (socket cannot send).

	470 SetFailureModeOnForeignSocket(true);

	471 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);

	472 }

	473

	474 // Checks that an OnStoppedCallback is issued when the track is stopped.

	475 TEST_F(SpeechRecognitionAudioSinkTest, OnReadyStateChangedOccured) {

	476 const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);

	477 AssertConsumedBuffers(0U);

	478 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);

	479 EXPECT_CALL(*this, StoppedCallback()).Times(1);

	480

	481 native_track()->Stop();

	482 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);

	483 }

	484

	485 } // namespace content

OLD	NEW