content/common/gpu/media/vt_video_decode_accelerator.cc - Issue 397883002: Implement actually decoding frames in VTVideoDecodeAccelerator.

Unified Diff: content/common/gpu/media/vt_video_decode_accelerator.cc

Issue 397883002: Implement actually decoding frames in VTVideoDecodeAccelerator. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Fix race on coded_size_. Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: content/common/gpu/media/vt_video_decode_accelerator.cc

diff --git a/content/common/gpu/media/vt_video_decode_accelerator.cc b/content/common/gpu/media/vt_video_decode_accelerator.cc

index 7745e6dfb999aacc259f50033d856d2aded7d28a..96bd68780b4e202af4b1eadac20150b42ae00fdc 100644

--- a/content/common/gpu/media/vt_video_decode_accelerator.cc

+++ b/content/common/gpu/media/vt_video_decode_accelerator.cc

@@ -17,9 +17,14 @@ using content_common_gpu_media::StubPathMap;

namespace content {

-// Size of length headers prepended to NALUs in MPEG-4 framing. (1, 2, or 4.)

+// Size of NALU length headers in AVCC/MPEG-4 format (can be 1, 2, or 4).

static const int kNALUHeaderLength = 4;

+// We only request 5 picture buffers from the client which are used to hold the

+// decoded samples. These buffers are then reused when the client tells us that

+// it is done with the buffer.

+static const int kNumPictureBuffers = 5;

// Route decoded frame callbacks back into the VTVideoDecodeAccelerator.

static void OutputThunk(

void* decompression_output_refcon,

@@ -29,22 +34,31 @@ static void OutputThunk(

CVImageBufferRef image_buffer,

CMTime presentation_time_stamp,

CMTime presentation_duration) {

+ // TODO(sandersd): Implement flush-before-delete to guarantee validity.

VTVideoDecodeAccelerator* vda =

reinterpret_cast<VTVideoDecodeAccelerator*>(decompression_output_refcon);

- int32_t* bitstream_id_ptr = reinterpret_cast<int32_t*>(source_frame_refcon);

- int32_t bitstream_id = *bitstream_id_ptr;

- delete bitstream_id_ptr;

- CFRetain(image_buffer);

- vda->Output(bitstream_id, status, info_flags, image_buffer);

+ intptr_t bitstream_id = reinterpret_cast<intptr_t>(source_frame_refcon);

+ vda->Output(bitstream_id, status, image_buffer);

+VTVideoDecodeAccelerator::DecodedFrame::DecodedFrame(

+ uint32_t bitstream_id,

+ CVImageBufferRef image_buffer)

+ : bitstream_id(bitstream_id),

+ image_buffer(image_buffer) {

+VTVideoDecodeAccelerator::DecodedFrame::~DecodedFrame() {

}

VTVideoDecodeAccelerator::VTVideoDecodeAccelerator(CGLContextObj cgl_context)

: cgl_context_(cgl_context),

client_(NULL),

- decoder_thread_("VTDecoderThread"),

format_(NULL),

session_(NULL),

- weak_this_factory_(this) {

+ gpu_task_runner_(base::ThreadTaskRunnerHandle::Get()),

+ weak_this_factory_(this),

+ decoder_thread_("VTDecoderThread") {

callback_.decompressionOutputCallback = OutputThunk;

callback_.decompressionOutputRefCon = this;

}

@@ -88,6 +102,9 @@ bool VTVideoDecodeAccelerator::Initialize(

void VTVideoDecodeAccelerator::ConfigureDecoder(

const std::vector<const uint8_t*>& nalu_data_ptrs,

const std::vector<size_t>& nalu_data_sizes) {

+ DCHECK(decoder_thread_.message_loop_proxy()->BelongsToCurrentThread());

+ // Construct a new format description from the parameter sets.

+ // TODO(sandersd): Replace this with custom code to support OS X < 10.9.

format_.reset();

CHECK(!CMVideoFormatDescriptionCreateFromH264ParameterSets(

kCFAllocatorDefault,

@@ -95,13 +112,11 @@ void VTVideoDecodeAccelerator::ConfigureDecoder(

&nalu_data_ptrs.front(), // &parameter_set_pointers

&nalu_data_sizes.front(), // &parameter_set_sizes

kNALUHeaderLength, // nal_unit_header_length

- format_.InitializeInto()

- ));

- // TODO(sandersd): Check if the size has changed and handle picture requests.

- CMVideoDimensions coded_size = CMVideoFormatDescriptionGetDimensions(format_);

- coded_size_.SetSize(coded_size.width, coded_size.height);

+ format_.InitializeInto()));

+ CMVideoDimensions coded_dimensions =

+ CMVideoFormatDescriptionGetDimensions(format_);

+ // Prepare VideoToolbox configuration dictionaries.

base::ScopedCFTypeRef<CFMutableDictionaryRef> decoder_config(

CFDictionaryCreateMutable(

kCFAllocatorDefault,

@@ -122,12 +137,12 @@ void VTVideoDecodeAccelerator::ConfigureDecoder(

&kCFTypeDictionaryKeyCallBacks,

&kCFTypeDictionaryValueCallBacks));

- // TODO(sandersd): ARGB for video that is not 4:2:0.

- int32_t pixel_format = '2vuy';

#define CFINT(i) CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &i)

+ // TODO(sandersd): RGBA option for 4:4:4 video.

+ int32_t pixel_format = kCVPixelFormatType_422YpCbCr8;

base::ScopedCFTypeRef<CFNumberRef> cf_pixel_format(CFINT(pixel_format));

- base::ScopedCFTypeRef<CFNumberRef> cf_width(CFINT(coded_size.width));

- base::ScopedCFTypeRef<CFNumberRef> cf_height(CFINT(coded_size.height));

+ base::ScopedCFTypeRef<CFNumberRef> cf_width(CFINT(coded_dimensions.width));

+ base::ScopedCFTypeRef<CFNumberRef> cf_height(CFINT(coded_dimensions.height));

#undef CFINT

CFDictionarySetValue(

image_config, kCVPixelBufferPixelFormatTypeKey, cf_pixel_format);

@@ -136,8 +151,8 @@ void VTVideoDecodeAccelerator::ConfigureDecoder(

CFDictionarySetValue(

image_config, kCVPixelBufferOpenGLCompatibilityKey, kCFBooleanTrue);

- // TODO(sandersd): Skip if the session is compatible.

- // TODO(sandersd): Flush frames when resetting.

+ // TODO(sandersd): Check if the session is already compatible.

+ // TODO(sandersd): Flush.

session_.reset();

CHECK(!VTDecompressionSessionCreate(

kCFAllocatorDefault,

@@ -145,9 +160,17 @@ void VTVideoDecodeAccelerator::ConfigureDecoder(

decoder_config, // video_decoder_specification

image_config, // destination_image_buffer_attributes

&callback_, // output_callback

- session_.InitializeInto()

- ));

- DVLOG(2) << "Created VTDecompressionSession";

+ session_.InitializeInto()));

+ // If the size has changed, trigger a request for new picture buffers.

+ gfx::Size new_coded_size(coded_dimensions.width, coded_dimensions.height);

+ if (coded_size_ != new_coded_size) {

Pawel Osciak 2014/07/18 01:22:12 Since you don't do dismiss/allow resolution change

sandersd (OOO until July 31) 2014/07/18 21:50:14 Resolution change will be supported as soon as I f

Pawel Osciak 2014/07/22 11:19:35 What exactly do you need to do? H264Parser stores

sandersd (OOO until July 31) 2014/07/22 19:00:23 I need to know which SPS/SPSExt/PPS records to bun

+ coded_size_ = new_coded_size;

+ gpu_task_runner_->PostTask(FROM_HERE, base::Bind(

+ &VTVideoDecodeAccelerator::SizeChangedTask,

+ weak_this_factory_.GetWeakPtr(),

+ coded_size_));;

+ }

}

void VTVideoDecodeAccelerator::Decode(const media::BitstreamBuffer& bitstream) {

@@ -167,7 +190,12 @@ void VTVideoDecodeAccelerator::DecodeTask(

CHECK(memory.Map(size));

const uint8_t* buf = static_cast<uint8_t*>(memory.memory());

- // Locate relevant NALUs in the buffer.

+ // NALUs are stored with Annex B format in the bitstream buffer (3-byte start

Pawel Osciak 2014/07/18 01:22:12 Actually, start codes can be either 3- or 4-byte.

sandersd (OOO until July 31) 2014/07/18 21:50:14 Done.

+ // codes), but VideoToolbox expects AVCC/MPEG-4 format (length headers), so we

+ // must rewrite the data.

+ //

+ // 1. Locate relevant NALUs and compute the size of the translated data.

+ // Also record any parameter sets for VideoToolbox initialization.

size_t data_size = 0;

std::vector<media::H264NALU> nalus;

std::vector<const uint8_t*> config_nalu_data_ptrs;

@@ -179,40 +207,167 @@ void VTVideoDecodeAccelerator::DecodeTask(

if (result == media::H264Parser::kEOStream)

break;

CHECK_EQ(result, media::H264Parser::kOk);

+ // TODO(sandersd): Check that these are only at the start.

if (nalu.nal_unit_type == media::H264NALU::kSPS ||

nalu.nal_unit_type == media::H264NALU::kPPS ||

nalu.nal_unit_type == media::H264NALU::kSPSExt) {

+ DVLOG(2) << "Parameter set " << nalu.nal_unit_type;

config_nalu_data_ptrs.push_back(nalu.data);

config_nalu_data_sizes.push_back(nalu.size);

+ } else {

+ nalus.push_back(nalu);

+ data_size += kNALUHeaderLength + nalu.size;

}

- nalus.push_back(nalu);

- // Each NALU will have a 4-byte length header prepended.

- data_size += kNALUHeaderLength + nalu.size;

}

- if (!config_nalu_data_ptrs.empty())

+ // 2. Initialize VideoToolbox.

+ // TODO(sandersd): Reinitialize when there are new parameter sets.

+ if (!session_)

ConfigureDecoder(config_nalu_data_ptrs, config_nalu_data_sizes);

- // TODO(sandersd): Rewrite slice NALU headers and send for decoding.

+ // 3. Allocate a memory-backed CMBlockBuffer for the translated data.

+ base::ScopedCFTypeRef<CMBlockBufferRef> data;

+ CHECK(!CMBlockBufferCreateWithMemoryBlock(

+ kCFAllocatorDefault,

+ NULL, // &memory_block

+ data_size, // block_length

+ kCFAllocatorDefault, // block_allocator

+ NULL, // &custom_block_source

+ 0, // offset_to_data

+ data_size, // data_length

+ 0, // flags

+ data.InitializeInto()));

+ // 4. Copy NALU data, inserting length headers.

+ size_t offset = 0;

+ for (size_t i = 0; i < nalus.size(); i++) {

+ media::H264NALU& nalu = nalus[i];

+ uint8_t header[4] = {0xff & nalu.size >> 24,

+ 0xff & nalu.size >> 16,

+ 0xff & nalu.size >> 8,

+ 0xff & nalu.size};

+ CHECK(!CMBlockBufferReplaceDataBytes(header, data, offset, 4));

+ offset += 4;

+ CHECK(!CMBlockBufferReplaceDataBytes(nalu.data, data, offset, nalu.size));

+ offset += nalu.size;

+ }

+ // 5. Package the data for VideoToolbox and request decoding.

+ base::ScopedCFTypeRef<CMSampleBufferRef> frame;

+ CHECK(!CMSampleBufferCreate(

+ kCFAllocatorDefault,

+ data, // data_buffer

+ true, // data_ready

+ NULL, // make_data_ready_callback

+ NULL, // make_data_ready_refcon

+ format_, // format_description

+ 1, // num_samples

+ 0, // num_sample_timing_entries

+ NULL, // &sample_timing_array

+ 0, // num_sample_size_entries

+ NULL, // &sample_size_array

+ frame.InitializeInto()));

+ VTDecodeFrameFlags decode_flags =

+ kVTDecodeFrame_EnableAsynchronousDecompression |

+ kVTDecodeFrame_EnableTemporalProcessing;

+ intptr_t bitstream_id = bitstream.id();

+ CHECK(!VTDecompressionSessionDecodeFrame(

+ session_,

+ frame, // sample_buffer

+ decode_flags, // decode_flags

+ reinterpret_cast<void*>(bitstream_id), // source_frame_refcon

+ NULL)); // &info_flags_out

}

// This method may be called on any VideoToolbox thread.

void VTVideoDecodeAccelerator::Output(

int32_t bitstream_id,

OSStatus status,

- VTDecodeInfoFlags info_flags,

CVImageBufferRef image_buffer) {

- // TODO(sandersd): Store the frame in a queue.

- CFRelease(image_buffer);

+ CHECK(!status);

+ CHECK_EQ(CFGetTypeID(image_buffer), CVPixelBufferGetTypeID());

+ CFRetain(image_buffer);

+ gpu_task_runner_->PostTask(FROM_HERE, base::Bind(

+ &VTVideoDecodeAccelerator::OutputTask,

+ weak_this_factory_.GetWeakPtr(),

Pawel Osciak 2014/07/18 01:22:12 Perhaps use one cached weakptr instead of generati

sandersd (OOO until July 31) 2014/07/18 21:50:14 I don't see much difference between copying a Weak

Pawel Osciak 2014/07/22 11:19:35 It's about correctness. See https://code.google.co

sandersd (OOO until July 31) 2014/07/22 19:00:23 The pointers are not bound to the thread they are

+ DecodedFrame(bitstream_id, image_buffer)));

+void VTVideoDecodeAccelerator::OutputTask(DecodedFrame frame) {

+ DCHECK(CalledOnValidThread());

+ decoded_frames_.push(frame);

+ SendPictures();

+void VTVideoDecodeAccelerator::SizeChangedTask(gfx::Size coded_size) {

+ texture_size_ = coded_size;

+ // TODO(sandersd): Dismiss existing picture buffers.

+ client_->ProvidePictureBuffers(

Pawel Osciak 2014/07/18 01:22:12 Do we want if (client_) ?

sandersd (OOO until July 31) 2014/07/18 21:50:14 This is always safe.

Pawel Osciak 2014/07/22 11:19:35 When I ask questions like this, I usually imply "i

sandersd (OOO until July 31) 2014/07/22 19:00:23 Acknowledged.

+ kNumPictureBuffers, texture_size_, GL_TEXTURE_RECTANGLE_ARB);

}

void VTVideoDecodeAccelerator::AssignPictureBuffers(

const std::vector<media::PictureBuffer>& pictures) {

DCHECK(CalledOnValidThread());

+ for (size_t i = 0; i < pictures.size(); i++) {

+ picture_ids_.push(pictures[i].id());

+ texture_ids_[pictures[i].id()] = pictures[i].texture_id();

+ }

+ // Pictures are not marked as uncleared until this method returns. They will

+ // become broken if they are used before that happens.

+ gpu_task_runner_->PostTask(FROM_HERE, base::Bind(

+ &VTVideoDecodeAccelerator::SendPictures,

+ weak_this_factory_.GetWeakPtr()));

}

void VTVideoDecodeAccelerator::ReusePictureBuffer(int32_t picture_id) {

DCHECK(CalledOnValidThread());

+ DCHECK_EQ(CFGetRetainCount(picture_bindings_[picture_id]), 1);

+ picture_bindings_.erase(picture_id);

+ picture_ids_.push(picture_id);

+ SendPictures();

+void VTVideoDecodeAccelerator::SendPictures() {

+ DCHECK(CalledOnValidThread());

+ if (picture_ids_.empty() || decoded_frames_.empty())

+ return;

+ CGLContextObj prev_context = CGLGetCurrentContext();

+ CHECK(!CGLSetCurrentContext(cgl_context_));

+ glEnable(GL_TEXTURE_RECTANGLE_ARB);

+ while (!picture_ids_.empty() && !decoded_frames_.empty()) {

+ int32_t picture_id = picture_ids_.front();

+ picture_ids_.pop();

+ DecodedFrame frame = decoded_frames_.front();

+ decoded_frames_.pop();

+ IOSurfaceRef surface = CVPixelBufferGetIOSurface(frame.image_buffer);

+ glBindTexture(GL_TEXTURE_RECTANGLE_ARB, texture_ids_[picture_id]);

Pawel Osciak 2014/07/18 01:22:12 Please use ScopedPictureBinder so that the current

sandersd (OOO until July 31) 2014/07/18 21:50:14 Done. This is an area I don't know much about and

Pawel Osciak 2014/07/22 11:19:35 I'm not an expert, but a rule of thumb is to alway

sandersd (OOO until July 31) 2014/07/22 19:00:23 Makes sense. It crashes if I don't restore though,

+ CHECK(!CGLTexImageIOSurface2D(

+ cgl_context_, // ctx

+ GL_TEXTURE_RECTANGLE_ARB, // target

+ GL_RGB, // internal_format

+ texture_size_.width(), // width

+ texture_size_.height(), // height

+ GL_YCBCR_422_APPLE, // format

+ GL_UNSIGNED_SHORT_8_8_APPLE, // type

+ surface, // io_surface

+ 0)); // plane

+ glBindTexture(GL_TEXTURE_RECTANGLE_ARB, 0);

+ picture_bindings_[picture_id] = frame.image_buffer;

+ client_->PictureReady(media::Picture(picture_id, frame.bitstream_id));

+ client_->NotifyEndOfBitstreamBuffer(frame.bitstream_id);

Pawel Osciak 2014/07/18 01:22:12 Could you do this in output task? What happens whe

sandersd (OOO until July 31) 2014/07/18 21:50:14 Re-ordering of out of order frames is an unsolved

Pawel Osciak 2014/07/22 11:19:35 The client will keep the VDA fed with a limited nu

sandersd (OOO until July 31) 2014/07/22 19:00:23 "Again" was parenthetical because it does not appl

"Again" was parenthetical because it does not apply in this case, but the last iteration of the PPAPI I saw did not have that restriction. The client does implement rate limiting, but it does so by tracking the number of bitstream buffers that have been sent for decoding but not yet acknowledged via NotifyEndOfBitstreamBuffer. You can see here that the |done_cb| is called as part of that callback: https://code.google.com/p/chromium/codesearch#chromium/src/media/filters/gpu_... When I experimented with this, ACKing sooner resulted in extreme system load while it tried to demux and decode at an unbounded rate.

Pawel Osciak 2014/07/23 04:14:23 The key here is that rate limiting is done on the

On 2014/07/22 19:00:23, sandersd wrote: > On 2014/07/22 11:19:35, Pawel Osciak wrote: > > On 2014/07/18 21:50:14, sandersd wrote: > > > On 2014/07/18 01:22:12, Pawel Osciak wrote: > > > > Could you do this in output task? What happens when frames are out of > order, > > > > we'd want to return bitstream buffers faster then their frames are output > > > > probably. > > > > > > Re-ordering of out of order frames is an unsolved problem right now, but if > > > these are decoupled then unlimited frames are sent for decoding. > > > > > > > The client will keep the VDA fed with a limited number of input buffers at a > > time. If reordering happens, you may potentially risk performance issues by > not > > keeping your input pipeline full enough, or starving/freezing yourself in the > > worst case, depending on the choice of input buffer count by the client. > > > > There is no low limit on the input buffers you may be fed with in the API, so > if > > a client decides to give you only a small number, you may just deadlock > yourself > > waiting for inputs. > > > > > I have been interpreting the API as: NotifyEndOfBitstreamBuffer implies > > > PictureReady will not be called (again) for the bitstream_id. > > > > No, this is not true. NotifyEndOfBitstreamBuffer means I'm done with the input > > you gave me, I'm ready for more. There is no relation to PictureReady > > whatsoever, and there should be no relation between handling of inputs and > > outputs, they should be separate. > > > > I'm not sure what you mean by "again"? PictureReady is called only once for > each > > id... > > > > > Obviously not what > > > all the VDAs are doing right now, but very similar to PPAPI. > > > This results in up > > > to kMaxInFlightDecodes (4) pending frames. > > > > > > The alternative is to implement our own rate limiting. > > > > What do you mean by rate limiting? Limiting the number of inputs in flight? > > Client will do this for you. 
> > > > "Again" was parenthetical because it does not apply in this case, but the last > iteration of the PPAPI I saw did not have that restriction. > > Client does implement rate limiting, but it does it by tracking the number of > bitstream buffers that have been sent for decoding that it has not been notified > are done with via. NotifyEndOfBitstreamBuffer. > > You can see here that the |done_cb| is called as part of that callback: > https://code.google.com/p/chromium/codesearch#chromium/src/media/filters/gpu_... > > When I experimented with this, ACKing sooner resulted in extreme system load > while it tried to demux and decode at an unbounded rate.

The key here is that rate limiting is done on the outputs by the client. The client will display and return PictureBuffers according to the timestamps in the stream. Rate-limiting inputs on buffer IDs in output order may result in the problems I mentioned above, because the output/display order may not be the same as the decode order. Moreover, if the stream does not provide the max_num_reorder_frames syntax element and you can't infer it from the H.264 profile (https://code.google.com/p/chromium/codesearch#chromium/src/content/common/gpu...), the decoder has to fill the whole DPB before it can output (sliding window over DPB contents), because it doesn't know if a negative POC will be given to it a bit later on. This means it will have to decode at least a DPB's worth of frames before it can output a frame. If the client allows fewer than DPB-size input buffers in flight, you will probably deadlock. Usually videos on YT etc. include the max_num_reorder_frames syntax element, but this is not a requirement of the standard. In your case you should simply limit yourself on the outputs, but not return inputs in output order. I would suggest just holding off decode if no output buffers are available.

sandersd (OOO until July 31) 2014/07/23 18:03:39 Currently not true (it's still in delivery order),

+ }

+ glDisable(GL_TEXTURE_RECTANGLE_ARB);

+ CHECK(!CGLSetCurrentContext(prev_context));

}

void VTVideoDecodeAccelerator::Flush() {

« content/common/gpu/media/vt_video_decode_accelerator.h ('K') | « content/common/gpu/media/vt_video_decode_accelerator.h ('k') | no next file » | no next file with comments »