Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(393)

Side by Side Diff: content/renderer/media/speech_recognition_audio_sink_unittest.cc

Issue 499233003: Binding media stream audio track to speech recognition [renderer] (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Add ENABLE_WEBRTC flag checks Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "content/renderer/media/speech_recognition_audio_sink.h"
6
7 #include "base/bind.h"
8 #include "base/strings/utf_string_conversions.h"
9 #include "content/renderer/media/media_stream_audio_source.h"
10 #include "content/renderer/media/mock_media_constraint_factory.h"
11 #include "content/renderer/media/webrtc/webrtc_local_audio_track_adapter.h"
12 #include "content/renderer/media/webrtc_local_audio_track.h"
13 #include "media/audio/audio_parameters.h"
14 #include "media/base/audio_bus.h"
15 #include "testing/gmock/include/gmock/gmock.h"
16 #include "testing/gtest/include/gtest/gtest.h"
17 #include "third_party/WebKit/public/platform/WebMediaStreamTrack.h"
18
19 namespace {
20
21 // Supported speech recognition audio parameters.
22 const int kSpeechRecognitionSampleRate = 16000;
23 const int kSpeechRecognitionFramesPerBuffer = 1600;
24
25 // Input audio format.
26 const media::AudioParameters::Format kInputFormat =
27 media::AudioParameters::AUDIO_PCM_LOW_LATENCY;
28 const media::ChannelLayout kInputChannelLayout = media::CHANNEL_LAYOUT_MONO;
29 const int kInputChannels = 1;
30 const int kInputBitsPerSample = 16;
31
32 // Output audio format.
33 const media::AudioParameters::Format kOutputFormat =
34 media::AudioParameters::AUDIO_PCM_LOW_LATENCY;
35 const media::ChannelLayout kOutputChannelLayout = media::CHANNEL_LAYOUT_STEREO;
36 const int kOutputChannels = 2;
37 const int kOutputBitsPerSample = 16;
38
39 // Mocked out sockets used for Send/Receive.
40 // Data is written and read from a shared buffer used as a FIFO and there is
41 // no blocking. |OnSendCB| is used to trigger a |Receive| on the other socket.
42 class MockSyncSocket : public base::SyncSocket {
43 public:
44 // This allows for 2 requests in queue between the |MockSyncSocket|s.
45 static const int kSharedBufferSize = 8;
46
47 // Buffer to be shared between two |MockSyncSocket|s. Allocated on heap.
48 struct SharedBuffer {
49 SharedBuffer() : data(), start(0), length(0) {}
50
51 uint8 data[kSharedBufferSize];
52 size_t start;
53 size_t length;
54 };
55
56 // Callback used for pairing an A.Send() with B.Receieve() without blocking.
57 typedef base::Callback<void()> OnSendCB;
58
59 explicit MockSyncSocket(SharedBuffer* shared_buffer)
60 : buffer_(shared_buffer),
61 in_failure_mode_(false) {}
62
63 MockSyncSocket(SharedBuffer* shared_buffer, const OnSendCB& on_send_cb)
64 : buffer_(shared_buffer),
65 on_send_cb_(on_send_cb),
66 in_failure_mode_(false) {}
67
68 virtual size_t Send(const void* buffer, size_t length) override;
69 virtual size_t Receive(void* buffer, size_t length) override;
70
71 // When |in_failure_mode_| == true, the socket fails to send.
72 void SetFailureMode(bool in_failure_mode) {
73 in_failure_mode_ = in_failure_mode;
74 }
75
76 private:
77 SharedBuffer* buffer_;
78 const OnSendCB on_send_cb_;
79 bool in_failure_mode_;
80
81 DISALLOW_COPY_AND_ASSIGN(MockSyncSocket);
82 };
83
84 // base::SyncSocket implementation
85 size_t MockSyncSocket::Send(const void* buffer, size_t length) {
86 if (in_failure_mode_)
87 return 0;
88
89 const uint8* b = static_cast<const uint8*>(buffer);
90 for (size_t i = 0; i < length; ++i, ++buffer_->length)
91 buffer_->data[buffer_->start + buffer_->length] = b[i];
92
93 on_send_cb_.Run();
94 return length;
95 }
96
97 size_t MockSyncSocket::Receive(void* buffer, size_t length) {
98 uint8* b = static_cast<uint8*>(buffer);
99 for (size_t i = buffer_->start; i < buffer_->length; ++i, ++buffer_->start)
100 b[i] = buffer_->data[buffer_->start];
101
102 // Since buffer is used sequentially, we can reset the buffer indices here.
103 buffer_->start = buffer_->length = 0;
104 return length;
105 }
106
107 // This fake class is the consumer used to verify behaviour of the producer.
108 // The |Initialize()| method shows what the consumer should be responsible for
109 // in the production code (minus the mocks).
110 class FakeSpeechRecognizer {
111 public:
112 FakeSpeechRecognizer() : is_responsive_(true) { }
113
114 void Initialize(
115 const blink::WebMediaStreamTrack& track,
116 const media::AudioParameters& sink_params,
117 base::SharedMemoryHandle* foreign_memory_handle) {
118 // Shared memory is allocated, mapped and shared.
119 uint32 shared_memory_size =
120 sizeof(media::AudioInputBufferParameters) +
121 media::AudioBus::CalculateMemorySize(sink_params);
122 shared_memory_.reset(new base::SharedMemory());
123 ASSERT_TRUE(shared_memory_->CreateAndMapAnonymous(shared_memory_size));
124 ASSERT_TRUE(shared_memory_->ShareToProcess(base::GetCurrentProcessHandle(),
125 foreign_memory_handle));
126
127 // Wrap the shared memory for the audio bus.
128 media::AudioInputBuffer* buffer =
129 static_cast<media::AudioInputBuffer*>(shared_memory_->memory());
130 audio_track_bus_ = media::AudioBus::WrapMemory(sink_params, buffer->audio);
131
132 // Reference to the counter used to synchronize.
133 buffer_index_ = &(buffer->params.size);
134 *buffer_index_ = 0U;
135
136 // Create a shared buffer for the |MockSyncSocket|s.
137 shared_buffer_.reset(new MockSyncSocket::SharedBuffer());
138
139 // Local socket will receive signals from the producer.
140 local_socket_.reset(new MockSyncSocket(shared_buffer_.get()));
141
142 // We automatically trigger a Receive when data is sent over the socket.
143 foreign_socket_ = new MockSyncSocket(
144 shared_buffer_.get(),
145 base::Bind(&FakeSpeechRecognizer::EmulateReceiveThreadLoopIteration,
146 base::Unretained(this)));
147
148 // This is usually done to pair the sockets. Here it's not effective.
149 base::SyncSocket::CreatePair(local_socket_.get(), foreign_socket_);
150 }
151
152 // Emulates a single iteraton of a thread receiving on the socket.
153 // This would normally be done on a receiving thread's task on the browser.
154 void EmulateReceiveThreadLoopIteration() {
155 // When not responsive do nothing as if the process is busy.
156 if (!is_responsive_)
157 return;
158
159 local_socket_->Receive(buffer_index_, sizeof(*buffer_index_));
160 // Notify the producer that the audio buffer has been consumed.
161 ++(*buffer_index_);
162 }
163
164 // Used to simulate an unresponsive behaviour of the consumer.
165 void SimulateResponsiveness(bool is_responsive) {
166 is_responsive_ = is_responsive;
167 }
168
169 MockSyncSocket* foreign_socket() { return foreign_socket_; }
170 media::AudioBus* audio_bus() const { return audio_track_bus_.get(); }
171 uint32 buffer_index() { return *buffer_index_; }
172
173 private:
174 bool is_responsive_;
175
176 // Shared memory for the audio and synchronization.
177 scoped_ptr<base::SharedMemory> shared_memory_;
178
179 // Fake sockets and their shared buffer.
180 scoped_ptr<MockSyncSocket::SharedBuffer> shared_buffer_;
181 scoped_ptr<MockSyncSocket> local_socket_;
182 MockSyncSocket* foreign_socket_;
183
184 // Audio bus wrapping the shared memory from the renderer.
185 scoped_ptr<media::AudioBus> audio_track_bus_;
186
187 // Used for synchronization of sent/received buffers.
188 uint32* buffer_index_;
189
190 DISALLOW_COPY_AND_ASSIGN(FakeSpeechRecognizer);
191 };
192
193 } // namespace
194
195 namespace content {
196
197 class SpeechRecognitionAudioSinkTest : public testing::Test {
198 public:
199 SpeechRecognitionAudioSinkTest() {}
200
201 ~SpeechRecognitionAudioSinkTest() {}
202
203 // Initializes the producer and consumer with specified audio parameters.
204 // Returns the minimal number of input audio buffers which need to be captured
205 // before they get sent to the consumer.
206 uint32 Initialize(int input_sample_rate,
207 int input_frames_per_buffer,
208 int output_sample_rate,
209 int output_frames_per_buffer) {
210 // Audio Environment setup.
211 source_params_.Reset(kInputFormat,
212 kInputChannelLayout,
213 kInputChannels,
214 input_sample_rate,
215 kInputBitsPerSample,
216 input_frames_per_buffer);
217 sink_params_.Reset(kOutputFormat,
218 kOutputChannelLayout,
219 kOutputChannels,
220 output_sample_rate,
221 kOutputBitsPerSample,
222 output_frames_per_buffer);
223 source_data_.reset(new int16[input_frames_per_buffer * kInputChannels]);
224
225 // Prepare the track and audio source.
226 blink::WebMediaStreamTrack blink_track;
227 PrepareBlinkTrackOfType(MEDIA_DEVICE_AUDIO_CAPTURE, &blink_track);
228
229 // Get the native track from the blink track and initialize.
230 native_track_ =
231 static_cast<WebRtcLocalAudioTrack*>(blink_track.extraData());
232 native_track_->OnSetFormat(source_params_);
233
234 // Create and initialize the consumer.
235 recognizer_.reset(new FakeSpeechRecognizer());
236 base::SharedMemoryHandle foreign_memory_handle;
237 recognizer_->Initialize(blink_track, sink_params_, &foreign_memory_handle);
238
239 // Create the producer.
240 scoped_ptr<base::SyncSocket> foreign_socket(recognizer_->foreign_socket());
241 speech_audio_sink_.reset(new SpeechRecognitionAudioSink(
242 blink_track, sink_params_, foreign_memory_handle,
243 foreign_socket.Pass(),
244 base::Bind(&SpeechRecognitionAudioSinkTest::StoppedCallback,
245 base::Unretained(this))));
246
247 // Return number of buffers needed to trigger resampling and consumption.
248 return static_cast<uint32>(std::ceil(
249 static_cast<double>(output_frames_per_buffer * input_sample_rate) /
250 (input_frames_per_buffer * output_sample_rate)));
251 }
252
253 // Mock callback expected to be called when the track is stopped.
254 MOCK_METHOD0(StoppedCallback, void());
255
256 protected:
257 // Prepares a blink track of a given MediaStreamType and attaches the native
258 // track which can be used to capture audio data and pass it to the producer.
259 static void PrepareBlinkTrackOfType(
260 const MediaStreamType device_type,
261 blink::WebMediaStreamTrack* blink_track) {
262 StreamDeviceInfo device_info(device_type, "Mock device",
263 "mock_device_id");
264 MockMediaConstraintFactory constraint_factory;
265 const blink::WebMediaConstraints constraints =
266 constraint_factory.CreateWebMediaConstraints();
267 scoped_refptr<WebRtcAudioCapturer> capturer(
268 WebRtcAudioCapturer::CreateCapturer(-1, device_info, constraints, NULL,
269 NULL));
270 scoped_refptr<WebRtcLocalAudioTrackAdapter> adapter(
271 WebRtcLocalAudioTrackAdapter::Create(std::string(), NULL));
272 scoped_ptr<WebRtcLocalAudioTrack> native_track(
273 new WebRtcLocalAudioTrack(adapter.get(), capturer, NULL));
274 blink::WebMediaStreamSource blink_audio_source;
275 blink_audio_source.initialize(base::UTF8ToUTF16("dummy_source_id"),
276 blink::WebMediaStreamSource::TypeAudio,
277 base::UTF8ToUTF16("dummy_source_name"));
278 MediaStreamSource::SourceStoppedCallback cb;
279 blink_audio_source.setExtraData(
280 new MediaStreamAudioSource(-1, device_info, cb, NULL));
281 blink_track->initialize(blink::WebString::fromUTF8("dummy_track"),
282 blink_audio_source);
283 blink_track->setExtraData(native_track.release());
284 }
285
286 // Emulates an audio capture device capturing data from the source.
287 inline void CaptureAudio(const uint32 buffers) {
288 for (uint32 i = 0; i < buffers; ++i)
289 native_track()->Capture(source_data(),
290 base::TimeDelta::FromMilliseconds(0), 1, false,
291 false);
292 }
293
294 // Used to simulate a problem with sockets.
295 void SetFailureModeOnForeignSocket(bool in_failure_mode) {
296 recognizer()->foreign_socket()->SetFailureMode(in_failure_mode);
297 }
298
299 // Helper method for verifying captured audio data has been consumed.
300 inline void AssertConsumedBuffers(const uint32 buffer_index) {
301 ASSERT_EQ(buffer_index, recognizer_->buffer_index());
302 }
303
304 // Helper method for providing audio data to producer and verifying it was
305 // consumed on the recognizer.
306 inline void CaptureAudioAndAssertConsumedBuffers(const uint32 buffers,
307 const uint32 buffer_index) {
308 CaptureAudio(buffers);
309 AssertConsumedBuffers(buffer_index);
310 }
311
312 // Helper method to capture and assert consumption at different sample rates
313 // and audio buffer sizes.
314 inline void AssertConsumptionForAudioParameters(
315 const int input_sample_rate,
316 const int input_frames_per_buffer,
317 const int output_sample_rate,
318 const int output_frames_per_buffer,
319 const uint32 consumptions) {
320 const uint32 kBuffersPerNotification =
321 Initialize(input_sample_rate, input_frames_per_buffer,
322 output_sample_rate, output_frames_per_buffer);
323 AssertConsumedBuffers(0U);
324
325 for (uint32 i = 1U; i <= consumptions; ++i) {
326 CaptureAudio(kBuffersPerNotification);
327 ASSERT_EQ(i, recognizer_->buffer_index())
328 << "Tested at rates: "
329 << "In(" << input_sample_rate << ", " << input_frames_per_buffer
330 << ") "
331 << "Out(" << output_sample_rate << ", " << output_frames_per_buffer
332 << ")";
333 }
334 }
335
336 int16* source_data() { return source_data_.get(); }
337
338 FakeSpeechRecognizer* recognizer() { return recognizer_.get(); }
339
340 const media::AudioParameters& sink_params() { return sink_params_; }
341
342 WebRtcLocalAudioTrack* native_track() { return native_track_; }
343
344 private:
345 // Producer.
346 scoped_ptr<SpeechRecognitionAudioSink> speech_audio_sink_;
347
348 // Consumer.
349 scoped_ptr<FakeSpeechRecognizer> recognizer_;
350
351 // Audio related members.
352 scoped_ptr<int16[]> source_data_;
353 media::AudioParameters source_params_;
354 media::AudioParameters sink_params_;
355 WebRtcLocalAudioTrack* native_track_;
356
357 DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionAudioSinkTest);
358 };
359
360 // Not all types of tracks are supported. This test checks if that policy is
361 // implemented correctly.
362 TEST_F(SpeechRecognitionAudioSinkTest, CheckIsSupportedAudioTrack) {
363 typedef std::map<MediaStreamType, bool> SupportedTrackPolicy;
364
365 // This test must be aligned with the policy of supported tracks.
366 SupportedTrackPolicy p;
367 p[MEDIA_NO_SERVICE] = false;
368 p[MEDIA_DEVICE_AUDIO_CAPTURE] = true; // The only one supported for now.
369 p[MEDIA_DEVICE_VIDEO_CAPTURE] = false;
370 p[MEDIA_TAB_AUDIO_CAPTURE] = false;
371 p[MEDIA_TAB_VIDEO_CAPTURE] = false;
372 p[MEDIA_DESKTOP_VIDEO_CAPTURE] = false;
373 p[MEDIA_LOOPBACK_AUDIO_CAPTURE] = false;
374 p[MEDIA_DEVICE_AUDIO_OUTPUT] = false;
375
376 // Ensure this test gets updated along with |content::MediaStreamType| enum.
377 EXPECT_EQ(NUM_MEDIA_TYPES, p.size());
378
379 // Check the the entire policy.
380 for (SupportedTrackPolicy::iterator it = p.begin(); it != p.end(); ++it) {
381 blink::WebMediaStreamTrack blink_track;
382 PrepareBlinkTrackOfType(it->first, &blink_track);
383 ASSERT_EQ(
384 it->second,
385 SpeechRecognitionAudioSink::IsSupportedTrack(blink_track));
386 }
387 }
388
389 // Checks if the producer can support the listed range of input sample rates
390 // and associated buffer sizes.
391 TEST_F(SpeechRecognitionAudioSinkTest, RecognizerNotifiedOnSocket) {
392 const size_t kNumAudioParamTuples = 24;
393 const int kAudioParams[kNumAudioParamTuples][2] = {
394 {8000, 80}, {8000, 800}, {16000, 160}, {16000, 1600},
395 {24000, 240}, {24000, 2400}, {32000, 320}, {32000, 3200},
396 {44100, 441}, {44100, 4410}, {48000, 480}, {48000, 4800},
397 {96000, 960}, {96000, 9600}, {11025, 111}, {11025, 1103},
398 {22050, 221}, {22050, 2205}, {88200, 882}, {88200, 8820},
399 {176400, 1764}, {176400, 17640}, {192000, 1920}, {192000, 19200}};
400
401 // Check all listed tuples of input sample rates and buffers sizes.
402 for (size_t i = 0; i < kNumAudioParamTuples; ++i) {
403 AssertConsumptionForAudioParameters(
404 kAudioParams[i][0], kAudioParams[i][1],
405 kSpeechRecognitionSampleRate, kSpeechRecognitionFramesPerBuffer, 3U);
406 }
407 }
408
409 // Checks that the input data is getting resampled to the target sample rate.
410 TEST_F(SpeechRecognitionAudioSinkTest, AudioDataIsResampledOnSink) {
411 EXPECT_GE(kInputChannels, 1);
412 EXPECT_GE(kOutputChannels, 1);
413
414 // Input audio is sampled at 44.1 KHz with data chunks of 10ms. Desired output
415 // is corresponding to the speech recognition engine requirements: 16 KHz with
416 // 100 ms chunks (1600 frames per buffer).
417 const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
418 // Fill audio input frames with 0, 1, 2, 3, ..., 440.
419 const uint32 kSourceDataLength = 441 * kInputChannels;
420 for (uint32 i = 0; i < kSourceDataLength; ++i) {
421 for (int c = 0; c < kInputChannels; ++c)
422 source_data()[i * kInputChannels + c] = i;
423 }
424
425 // Prepare sink audio bus and data for rendering.
426 media::AudioBus* sink_bus = recognizer()->audio_bus();
427 const uint32 kSinkDataLength = 1600 * kOutputChannels;
428 int16 sink_data[kSinkDataLength] = {0};
429
430 // Render the audio data from the recognizer.
431 sink_bus->ToInterleaved(sink_bus->frames(),
432 sink_params().bits_per_sample() / 8, sink_data);
433
434 // Checking only a fraction of the sink frames.
435 const uint32 kNumFramesToTest = 12;
436
437 // Check all channels are zeroed out before we trigger resampling.
438 for (uint32 i = 0; i < kNumFramesToTest; ++i) {
439 for (int c = 0; c < kOutputChannels; ++c)
440 EXPECT_EQ(0, sink_data[i * kOutputChannels + c]);
441 }
442
443 // Trigger the speech sink to resample the input data.
444 AssertConsumedBuffers(0U);
445 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
446
447 // Render the audio data from the recognizer.
448 sink_bus->ToInterleaved(sink_bus->frames(),
449 sink_params().bits_per_sample() / 8, sink_data);
450
451 // Resampled data expected frames. Extracted based on |source_data()|.
452 const int16 kExpectedData[kNumFramesToTest] = {0, 2, 5, 8, 11, 13,
453 16, 19, 22, 24, 27, 30};
454
455 // Check all channels have the same resampled data.
456 for (uint32 i = 0; i < kNumFramesToTest; ++i) {
457 for (int c = 0; c < kOutputChannels; ++c)
458 EXPECT_EQ(kExpectedData[i], sink_data[i * kOutputChannels + c]);
459 }
460 }
461
462 // Checks that the producer does not misbehave when a socket failure occurs.
463 TEST_F(SpeechRecognitionAudioSinkTest, SyncSocketFailsSendingData) {
464 const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
465 // Start with no problems on the socket.
466 AssertConsumedBuffers(0U);
467 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
468
469 // A failure occurs (socket cannot send).
470 SetFailureModeOnForeignSocket(true);
471 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
472 }
473
474 // Checks that an OnStoppedCallback is issued when the track is stopped.
475 TEST_F(SpeechRecognitionAudioSinkTest, OnReadyStateChangedOccured) {
476 const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
477 AssertConsumedBuffers(0U);
478 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
479 EXPECT_CALL(*this, StoppedCallback()).Times(1);
480
481 native_track()->Stop();
482 CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
483 }
484
485 } // namespace content
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698