Index: doc/draft-ietf-codec-oggopus.xml |
diff --git a/doc/draft-ietf-codec-oggopus.xml b/doc/draft-ietf-codec-oggopus.xml |
index 6131e69ed5aaa8179ee6fe6b4b84a747cd26ce3f..e5181c92ebc6ad0454cfaa9874abfd90403c8ca0 100644 |
--- a/doc/draft-ietf-codec-oggopus.xml |
+++ b/doc/draft-ietf-codec-oggopus.xml |
@@ -1,16 +1,17 @@ |
<?xml version="1.0" encoding="utf-8"?> |
<!DOCTYPE rfc SYSTEM 'rfc2629.dtd' [ |
-<!ENTITY rfc2119 PUBLIC '' 'https://xml2rfc.tools.ietf.org/tools/xml2rfc/public/rfc/bibxml/reference.RFC.2119.xml'> |
-<!ENTITY rfc3533 PUBLIC '' 'https://xml2rfc.tools.ietf.org/tools/xml2rfc/public/rfc/bibxml/reference.RFC.3533.xml'> |
-<!ENTITY rfc3629 PUBLIC '' 'https://xml2rfc.tools.ietf.org/tools/xml2rfc/public/rfc/bibxml/reference.RFC.3629.xml'> |
-<!ENTITY rfc4732 PUBLIC '' 'https://xml2rfc.tools.ietf.org/tools/xml2rfc/public/rfc/bibxml/reference.RFC.4732.xml'> |
-<!ENTITY rfc5334 PUBLIC '' 'https://xml2rfc.tools.ietf.org/tools/xml2rfc/public/rfc/bibxml/reference.RFC.5334.xml'> |
-<!ENTITY rfc6381 PUBLIC '' 'https://xml2rfc.tools.ietf.org/tools/xml2rfc/public/rfc/bibxml/reference.RFC.6381.xml'> |
-<!ENTITY rfc6716 PUBLIC '' 'https://xml2rfc.tools.ietf.org/tools/xml2rfc/public/rfc/bibxml/reference.RFC.6716.xml'> |
+<!ENTITY rfc2119 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml'> |
+<!ENTITY rfc3533 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3533.xml'> |
+<!ENTITY rfc3629 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3629.xml'> |
+<!ENTITY rfc4732 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.4732.xml'> |
+<!ENTITY rfc5334 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5334.xml'> |
+<!ENTITY rfc6381 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6381.xml'> |
+<!ENTITY rfc6716 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6716.xml'> |
+<!ENTITY rfc6982 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6982.xml'> |
]> |
<?rfc toc="yes" symrefs="yes" ?> |
-<rfc ipr="trust200902" category="std" docName="draft-ietf-codec-oggopus-01"> |
+<rfc ipr="trust200902" category="std" docName="draft-ietf-codec-oggopus-06"> |
<front> |
<title abbrev="Ogg Opus">Ogg Encapsulation for the Opus Audio Codec</title> |
@@ -54,12 +55,12 @@ |
<code>V6B 1H5</code> |
<country>Canada</country> |
</postal> |
-<phone>+1 604 778 1540</phone> |
+<phone>+1 778 785 1540</phone> |
<email>giles@xiph.org</email> |
</address> |
</author> |
-<date day="24" month="May" year="2013"/> |
+<date day="18" month="October" year="2014"/> |
<area>RAI</area> |
<workgroup>codec</workgroup> |
@@ -100,7 +101,7 @@ Each page is associated with a particular logical stream and contains a capture |
stream, to aid seeking. |
A single page can contain up to 65,025 octets of packet data from up to 255 |
different packets. |
-Packets may be split arbitrarily across pages, and continued from one page to |
+Packets MAY be split arbitrarily across pages, and continued from one page to |
the next (allowing packets much larger than would fit on a single page). |
Each page contains 'lacing values' that indicate how the data is partitioned |
into packets, allowing a demuxer to recover the packet boundaries without |
@@ -109,7 +110,7 @@ A packet is said to 'complete' on a page when the page contains the final |
lacing value corresponding to that packet. |
</t> |
<t> |
-This encapsulation defines the required contents of the packet data, including |
+This encapsulation defines the contents of the packet data, including |
the necessary headers, the organization of those packets into a logical |
stream, and the interpretation of the codec-specific granule position field. |
It does not attempt to describe or specify the existing Ogg container format. |
@@ -122,8 +123,8 @@ Readers unfamiliar with the basic concepts mentioned above are encouraged to |
<section anchor="terminology" title="Terminology"> |
<t> |
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", |
- "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be |
- interpreted as described in <xref target="RFC2119"/>. |
+ "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this |
+ document are to be interpreted as described in <xref target="RFC2119"/>. |
</t> |
<t> |
@@ -138,7 +139,7 @@ All other implementations are "unconditionally compliant". |
<section anchor="packet_organization" title="Packet Organization"> |
<t> |
-An Opus stream is organized as follows. |
+An Ogg Opus stream is organized as follows. |
</t> |
<t> |
There are two mandatory header packets. |
@@ -149,7 +150,7 @@ The first packet in the logical Ogg bitstream MUST contain the identification |
(ID) header, which uniquely identifies a stream as Opus audio. |
The format of this header is defined in <xref target="id_header"/>. |
It MUST be placed alone (without any other packet data) on the first page of |
- the logical Ogg bitstream, and must complete on that page. |
+ the logical Ogg bitstream, and MUST complete on that page. |
This page MUST have its 'beginning of stream' flag set. |
</t> |
<t> |
@@ -165,8 +166,8 @@ However many pages it spans, the comment header packet MUST finish the page on |
All subsequent pages are audio data pages, and the Ogg packets they contain are |
audio data packets. |
Each audio data packet contains one Opus packet for each of N different |
- streams, where N is typically one for mono or stereo, but may be greater than |
- one for, e.g., multichannel audio. |
+ streams, where N is typically one for mono or stereo, but MAY be greater than |
+ one for multichannel audio. |
The value N is specified in the ID header (see |
<xref target="channel_mapping"/>), and is fixed over the entire length of the |
logical Ogg bitstream. |
@@ -179,11 +180,17 @@ The remaining Opus packet is packed at the end of the Ogg packet using the |
regular, undelimited framing from Section 3 of <xref target="RFC6716"/>. |
All of the Opus packets in a single Ogg packet MUST be constrained to have the |
same duration. |
-The duration and coding modes of each Opus packet are contained in the |
- TOC (table of contents) sequence in the first few bytes. |
A decoder SHOULD treat any Opus packet whose duration is different from that of |
- the first Opus packet in an Ogg packet as if it were an Opus packet with an |
- illegal TOC sequence. |
+ the first Opus packet in an Ogg packet as if it were a malformed Opus packet |
+ with an invalid TOC sequence. |
+</t> |
+<t> |
+The coding mode (SILK, Hybrid, or CELT), audio bandwidth, channel count, |
+ duration (frame size), and number of frames per packet are indicated in the |
+ TOC (table of contents) sequence at the beginning of each Opus packet, as |
+ described in Section 3.1 of <xref target="RFC6716"/>. |
+The combination of mode, audio bandwidth, and frame size is referred to as |
+ the configuration of an Opus packet. |
</t> |
<t> |
The first audio data page SHOULD NOT have the 'continued packet' flag set |
@@ -191,13 +198,15 @@ The first audio data page SHOULD NOT have the 'continued packet' flag set |
page). |
Packets MUST be placed into Ogg pages in order until the end of stream. |
Audio packets MAY span page boundaries. |
-A decoder MUST treat a zero-octet audio data packet as if it were an Opus |
- packet with an illegal TOC sequence. |
+A decoder MUST treat a zero-octet audio data packet as if it were a malformed |
+ Opus packet as described in Section 3.4 of <xref target="RFC6716"/>. |
+</t> |
+<t> |
The last page SHOULD have the 'end of stream' flag set, but implementations |
- should be prepared to deal with truncated streams that do not have a page |
+ need to be prepared to deal with truncated streams that do not have a page |
marked 'end of stream'. |
The final packet on the last page SHOULD NOT be a continued packet, i.e., the |
- final lacing value should be less than 255. |
+ final lacing value SHOULD be less than 255. |
There MUST NOT be any more pages in an Opus logical bitstream after a page |
marked 'end of stream'. |
</t> |
@@ -223,7 +232,7 @@ It is possible to run an Opus decoder at other sampling rates, but the value |
</t> |
<t> |
-The duration of an Opus packet may be any multiple of 2.5 ms, up to a |
+The duration of an Opus packet can be any multiple of 2.5 ms, up to a |
maximum of 120 ms. |
This duration is encoded in the TOC sequence at the beginning of each packet. |
The number of samples returned by a decoder corresponds to this duration |
@@ -245,19 +254,105 @@ All other pages with completed packets after the first MUST have a granule |
This guarantees that a demuxer can assign individual packets the same granule |
position when working forwards as when working backwards. |
For this to work, there cannot be any gaps. |
-In order to support capturing a stream that uses discontinuous transmission |
- (DTX), an encoder SHOULD emit packets that explicitly request the use of |
- Packet Loss Concealment (PLC) (i.e., with a frame length of 0, as defined in |
- Section 3.2.1 of <xref target="RFC6716"/>) in place of the packets that were |
- not transmitted. |
</t> |
+<section anchor="gap-repair" title="Repairing Gaps in Real-time Streams"> |
+<t> |
+In order to support capturing a real-time stream that has lost or not |
+ transmitted packets, a muxer SHOULD emit packets that explicitly request the |
+ use of Packet Loss Concealment (PLC) in place of the missing packets. |
+Only gaps that are a multiple of 2.5 ms are repairable, as these are the |
+ only durations that can be created by packet loss or discontinuous |
+ transmission. |
+Muxers need not handle other gap sizes. |
+Creating the necessary packets involves synthesizing a TOC byte (defined in |
+Section 3.1 of <xref target="RFC6716"/>)—and whatever |
+ additional internal framing is needed—to indicate the packet duration |
+ for each stream. |
+The actual length of each missing Opus frame inside the packet is zero bytes, |
+ as defined in Section 3.2.1 of <xref target="RFC6716"/>. |
+</t> |
+ |
+<t> |
+Zero-byte frames MAY be packed into packets using any of codes 0, 1, |
+ 2, or 3. |
+When successive frames have the same configuration, the higher code packings |
+ reduce overhead. |
+Likewise, if the TOC configuration matches, the muxer MAY further combine the |
+ empty frames with previous or subsequent non-zero-length frames (using |
+ code 2 or VBR code 3). |
+</t> |
+ |
+<t> |
+<xref target="RFC6716"/> does not impose any requirements on the PLC, but this |
+ section outlines choices that are expected to have a positive influence on |
+ most PLC implementations, including the reference implementation. |
+Synthesized TOC sequences SHOULD maintain the same mode, audio bandwidth, |
+ channel count, and frame size as the previous packet (if any). |
+This is the simplest and usually the most well-tested case for the PLC to |
+ handle and it covers all losses that do not include a configuration switch, |
+ as defined in Section 4.5 of <xref target="RFC6716"/>. |
+</t> |
+ |
+<t> |
+When a previous packet is available, keeping the audio bandwidth and channel |
+ count the same allows the PLC to provide maximum continuity in the concealment |
+ data it generates. |
+However, if the size of the gap is not a multiple of the most recent frame |
+ size, then the frame size will have to change for at least some frames. |
+Such changes SHOULD be delayed as long as possible to simplify |
+ things for PLC implementations. |
+</t> |
+ |
+<t> |
+As an example, a 95 ms gap could be encoded as nineteen 5 ms frames |
+ in two bytes with a single CBR code 3 packet. |
+If the previous frame size was 20 ms, using four 20 ms frames |
+ followed by three 5 ms frames requires 4 bytes (plus an extra byte |
+ of Ogg lacing overhead), but allows the PLC to use its well-tested steady |
+ state behavior for as long as possible. |
+The total bitrate of the latter approach, including Ogg overhead, is about |
+ 0.4 kbps, so the impact on file size is minimal. |
+</t> |
+ |
+<t> |
+Changing modes is discouraged, since this causes some decoder implementations |
+ to reset their PLC state. |
+However, SILK and Hybrid mode frames cannot fill gaps that are not a multiple |
+ of 10 ms. |
+If switching to CELT mode is needed to match the gap size, a muxer SHOULD do |
+ so at the end of the gap to allow the PLC to function for as long as possible. |
+</t> |
+ |
+<t> |
+In the example above, if the previous frame was a 20 ms SILK mode frame, |
+ the better solution is to synthesize a packet describing four 20 ms SILK |
+ frames, followed by a packet with a single 10 ms SILK |
+ frame, and finally a packet with a 5 ms CELT frame, to fill the 95 ms |
+ gap. |
+This also requires four bytes to describe the synthesized packet data (two |
+ bytes for a CBR code 3 and one byte each for two code 0 packets) but three |
+ bytes of Ogg lacing overhead are needed to mark the packet boundaries. |
+At 0.6 kbps, this is still a minimal bitrate impact over a naive, low-quality |
+ solution. |
+</t> |
+ |
+<t> |
+Since medium-band audio is an option only in the SILK mode, wideband frames |
+ SHOULD be generated if switching from that configuration to CELT mode, to |
+ ensure that any PLC implementation which does try to migrate state between |
+ the modes will be able to preserve all of the available audio bandwidth. |
+</t> |
+ |
+</section> |
+ |
<section anchor="preskip" title="Pre-skip"> |
<t> |
There is some amount of latency introduced during the decoding process, to |
- allow for overlap in the MDCT modes, stereo mixing in the LP modes, and |
- resampling, and the encoder will introduce even more latency (though the exact |
- amount is not specified). |
+ allow for overlap in the CELT mode, stereo mixing in the SILK mode, and |
+ resampling. |
+The encoder might have introduced additional latency through its own resampling |
+ and analysis (though the exact amount is not specified). |
Therefore, the first few samples produced by the decoder do not correspond to |
real input audio, but are instead composed of padding inserted by the encoder |
to compensate for this latency. |
@@ -271,20 +366,39 @@ However, a decoder will want to skip these samples after decoding them. |
A 'pre-skip' field in the ID header (see <xref target="id_header"/>) signals |
the number of samples which SHOULD be skipped (decoded but discarded) at the |
beginning of the stream. |
-This provides sufficient history to the decoder so that it has already |
- converged before the stream's output begins. |
-It may also be used to perform sample-accurate cropping of existing encoded |
- streams. |
-This amount need not be a multiple of 2.5 ms, may be smaller than a single |
- packet, or may span the contents of several packets. |
+This amount need not be a multiple of 2.5 ms, MAY be smaller than a single |
+ packet, or MAY span the contents of several packets. |
+These samples are not valid audio, and SHOULD NOT be played. |
</t> |
+ |
+<t> |
+For example, if the first Opus frame uses the CELT mode, it will always |
+ produce 120 samples of windowed overlap-add data. |
+However, the overlap data is initially all zeros (since there is no prior |
+ frame), meaning this cannot, in general, accurately represent the original |
+ audio. |
+The SILK mode requires additional delay to account for its analysis and |
+ resampling latency. |
+The encoder delays the original audio to avoid this problem. |
+</t> |
+ |
+<t> |
+The pre-skip field MAY also be used to perform sample-accurate cropping of |
+ already encoded streams. |
+In this case, a value of at least 3840 samples (80 ms) provides |
+ sufficient history to the decoder that it will have converged |
+ before the stream's output begins. |
+</t> |
+ |
</section> |
<section anchor="pcm_sample_position" title="PCM Sample Position"> |
<t> |
+<figure align="center"> |
+<preamble> |
The PCM sample position is determined from the granule position using the |
formula |
-<figure align="center"> |
+</preamble> |
<artwork align="center"><![CDATA[ |
'PCM sample position' = 'granule position' - 'pre-skip' . |
]]></artwork> |
@@ -295,8 +409,10 @@ The PCM sample position is determined from the granule position using the |
For example, if the granule position of the first audio data page is 59,971, |
and the pre-skip is 11,971, then the PCM sample position of the last decoded |
sample from that page is 48,000. |
-This can be converted into a playback time using the formula |
<figure align="center"> |
+<preamble> |
+This can be converted into a playback time using the formula |
+</preamble> |
<artwork align="center"><![CDATA[ |
'PCM sample position' |
'playback time' = --------------------- . |
@@ -317,12 +433,12 @@ In this case, the PCM sample position of the first audio sample to be played |
<t> |
Vorbis streams use a granule position smaller than the number of audio samples |
contained in the first audio data page to indicate that some of those samples |
- must be trimmed from the output (see <xref target="vorbis-trim"/>). |
+ are trimmed from the output (see <xref target="vorbis-trim"/>). |
However, to do so, Vorbis requires that the first audio data page contains |
exactly two packets, in order to allow the decoder to perform PCM position |
adjustments before needing to return any PCM data. |
Opus uses the pre-skip mechanism for this purpose instead, since the encoder |
- may introduce more than a single packet's worth of latency, and since very |
+ MAY introduce more than a single packet's worth of latency, and since very |
large packets in streams with a very large number of channels might not fit |
on a single page. |
</t> |
@@ -356,11 +472,11 @@ Allowing a granule position larger than the number of samples allows the |
beginning of a stream to be cropped or a live stream to be joined without |
rewriting the granule position of all the remaining pages. |
This means that the PCM sample position just before the first sample to be |
- played may be larger than '0'. |
+ played MAY be larger than '0'. |
Synchronization when multiplexing with other logical streams still uses the PCM |
sample position relative to '0' to compute sample times. |
This does not affect the behavior of pre-skip: exactly 'pre-skip' samples |
- should be skipped from the beginning of the decoded output, even if the |
+ SHOULD be skipped from the beginning of the decoded output, even if the |
initial PCM sample position is greater than zero. |
</t> |
@@ -368,7 +484,7 @@ This does not affect the behavior of pre-skip: exactly 'pre-skip' samples |
On the other hand, a granule position that is smaller than the number of |
decoded samples prevents a demuxer from working backwards to assign each |
packet or each individual sample a valid granule position, since granule |
- positions must be non-negative. |
+ positions are non-negative. |
A decoder MUST reject as invalid any stream where the granule position is |
smaller than the number of samples contained in packets that complete on the |
first audio data page with a completed packet, unless that page has the 'end |
@@ -380,7 +496,7 @@ It MAY defer this action until it decodes the last packet completed on that |
<t> |
If that page has the 'end of stream' flag set, a demuxer MUST reject as invalid |
any stream where its granule position is smaller than the 'pre-skip' amount. |
-This would indicate that more samples should be skipped from the initial |
+This would indicate that there are more samples to be skipped from the initial |
decoded output than exist in the stream. |
If the granule position is smaller than the number of decoded samples produced |
by the packets that complete on that page, then a demuxer MUST use an initial |
@@ -414,8 +530,8 @@ This 'pre-roll' is separate from, and unrelated to, the 'pre-skip' used at the |
If the point 80 ms prior to the seek target comes before the initial PCM |
sample position, the decoder SHOULD start decoding from the beginning of the |
stream, applying pre-skip as normal, regardless of whether the pre-skip is |
- larger or smaller than 80 ms, and then continue to discard the samples |
- required to reach the seek target (if any). |
+ larger or smaller than 80 ms, and then continue to discard samples |
+ to reach the seek target (if any). |
</t> |
</section> |
@@ -518,9 +634,9 @@ When cropping the beginning of existing Ogg Opus streams, a pre-skip of at |
This field is <spanx style="emph">not</spanx> the sample rate to use for |
playback of the encoded data. |
<vspace blankLines="1"/> |
-Opus has a handful of coding modes, with internal audio bandwidths of 4, 6, 8, |
- 12, and 20 kHz. |
-Each packet in the stream may have a different audio bandwidth. |
+Opus can switch between internal audio bandwidths of 4, 6, 8, 12, and |
+ 20 kHz. |
+Each packet in the stream can have a different audio bandwidth. |
Regardless of the audio bandwidth, the reference decoder supports decoding any |
stream at a sample rate of 8, 12, 16, 24, or 48 kHz. |
The original sample rate of the encoder input is not preserved by the lossy |
@@ -533,12 +649,13 @@ An Ogg Opus player SHOULD select the playback sample rate according to the |
<t>Otherwise, if the hardware's highest available sample rate is a supported |
rate, decode at this sample rate.</t> |
<t>Otherwise, if the hardware's highest available sample rate is less than |
- 48 kHz, decode at the highest supported rate above this and resample.</t> |
+ 48 kHz, decode at the next highest supported rate above this and |
+ resample.</t> |
<t>Otherwise, decode at 48 kHz and resample.</t> |
</list> |
However, the 'Input Sample Rate' field allows the encoder to pass the sample |
rate of the original input stream as metadata. |
-This may be useful when the user requires the output sample rate to match the |
+This is useful when the user requires the output sample rate to match the |
input sample rate. |
For example, a non-player decoder writing PCM format samples to disk might |
choose to resample the output audio back to the original input sample rate to |
@@ -559,39 +676,42 @@ This is a gain to be applied by the decoder. |
It is 20*log10 of the factor to scale the decoder output by to achieve the |
desired playback volume, stored in a 16-bit, signed, two's complement |
fixed-point value with 8 fractional bits (i.e., Q7.8). |
-To apply the gain, a decoder could use |
<figure align="center"> |
+<preamble> |
+To apply the gain, a decoder could use |
+</preamble> |
<artwork align="center"><![CDATA[ |
sample *= pow(10, output_gain/(20.0*256)) , |
]]></artwork> |
-</figure> |
+<postamble> |
where output_gain is the raw 16-bit value from the header. |
+</postamble> |
+</figure> |
<vspace blankLines="1"/> |
-Virtually all players and media frameworks should apply it by default. |
+Virtually all players and media frameworks SHOULD apply it by default. |
If a player chooses to apply any volume adjustment or gain modification, such |
- as the R128_TRACK_GAIN (see <xref target="comment_header"/>) or a user-facing |
- volume knob, the adjustment MUST be applied in addition to this output gain in |
- order to achieve playback at the desired volume. |
+ as the R128_TRACK_GAIN (see <xref target="comment_header"/>), the adjustment |
+ MUST be applied in addition to this output gain in order to achieve playback |
+ at the normalized volume. |
<vspace blankLines="1"/> |
An encoder SHOULD set this field to zero, and instead apply any gain prior to |
encoding, when this is possible and does not conflict with the user's wishes. |
-The output gain should only be nonzero when the gain is adjusted after |
- encoding, or when the user wishes to adjust the gain for playback while |
- preserving the ability to recover the original signal amplitude. |
+A nonzero output gain indicates the gain was adjusted after encoding, or that |
+ a user wished to adjust the gain for playback while preserving the ability |
+ to recover the original signal amplitude. |
<vspace blankLines="1"/> |
Although the output gain has enormous range (+/- 128 dB, enough to amplify |
inaudible sounds to the threshold of physical pain), most applications can |
only reasonably use a small portion of this range around zero. |
The large range serves in part to ensure that gain can always be losslessly |
- transferred between OpusHead and R128_TRACK_GAIN (see below) without |
+ transferred between OpusHead and R128 gain tags (see below) without |
saturating. |
<vspace blankLines="1"/> |
</t> |
<t><spanx style="strong">Channel Mapping Family</spanx> (8 bits, |
unsigned): |
<vspace blankLines="1"/> |
-This octet indicates the order and semantic meaning of the various channels |
- encoded in each Ogg packet. |
+This octet indicates the order and semantic meaning of the output channels. |
<vspace blankLines="1"/> |
Each possible value of this octet indicates a mapping family, which defines a |
set of allowed channel counts, and the ordered set of channel names for each |
@@ -651,7 +771,7 @@ The fields in the channel mapping table have the following meaning: |
<t><spanx style="strong">Stream Count</spanx> 'N' (8 bits, unsigned): |
<vspace blankLines="1"/> |
This is the total number of streams encoded in each Ogg packet. |
-This value is required to correctly parse the packed Opus packets inside an |
+This value is necessary to correctly parse the packed Opus packets inside an |
Ogg packet, as described in <xref target="packet_organization"/>. |
This value MUST NOT be zero, as without at least one Opus packet with a valid |
TOC sequence, a demuxer cannot recover the duration of an Ogg packet. |
@@ -660,7 +780,7 @@ For channel mapping family 0, this value defaults to 1, and is not coded. |
<vspace blankLines="1"/> |
</t> |
<t><spanx style="strong">Coupled Stream Count</spanx> 'M' (8 bits, unsigned): |
-This is the number of streams whose decoders should be configured to produce |
+This is the number of streams whose decoders are to be configured to produce |
two channels. |
This MUST be no larger than the total number of streams, N. |
<vspace blankLines="1"/> |
@@ -675,8 +795,8 @@ Regardless of the internal channel count, any Opus stream can be decoded as |
mono (a single channel) or stereo (two channels) by appropriate initialization |
of the decoder. |
The 'coupled stream count' field indicates that the first M Opus decoders are |
- to be initialized in stereo mode, and the remaining N-M decoders are to be |
- initialized in mono mode. |
+ to be initialized for stereo output, and the remaining N-M decoders are to be |
+ initialized for mono only. |
The total number of decoded channels, (M+N), MUST be no larger than 255, as |
there is no way to index more channels than that in the channel mapping. |
<vspace blankLines="1"/> |
@@ -686,14 +806,14 @@ For channel mapping family 0, this value defaults to C-1 (i.e., 0 for mono |
</t> |
<t><spanx style="strong">Channel Mapping</spanx> (8*C bits): |
This contains one octet per output channel, indicating which decoded channel |
- should be used for each one. |
+ is to be used for each one. |
Let 'index' be the value of this octet for a particular output channel. |
This value MUST either be smaller than (M+N), or be the special value 255. |
If 'index' is less than 2*M, the output MUST be taken from decoding stream |
('index'/2) as stereo and selecting the left channel if 'index' is even, and |
the right channel if 'index' is odd. |
-If 'index' is 2*M or larger, the output MUST be taken from decoding stream |
- ('index'-M) as mono. |
+If 'index' is 2*M or larger, but less than 255, the output MUST be taken from |
+ decoding stream ('index'-M) as mono. |
If 'index' is 255, the corresponding output channel MUST contain pure silence. |
<vspace blankLines="1"/> |
The number of output channels, C, is not constrained to match the number of |
@@ -712,7 +832,7 @@ Neither index is coded. |
<t> |
After producing the output channels, the channel mapping family determines the |
semantic meaning of each one. |
-Currently there are three defined mapping families, although more may be added. |
+There are three defined mapping families in this specification. |
</t> |
<section anchor="channel_mapping_0" title="Channel Mapping Family 0"> |
@@ -742,7 +862,7 @@ Vorbis channel order. |
</t> |
<t> |
Each channel is assigned to a speaker location in a conventional surround |
- configuration. |
+ arrangement. |
Specific locations depend on the number of channels, and are given below |
in order of the corresponding channel indicies. |
<list style="symbols"> |
@@ -755,19 +875,20 @@ Specific locations depend on the number of channels, and are given below |
<t>7 channels: 6.1 surround (front left, front center, front right, side left, side right, rear center, LFE).</t> |
<t>8 channels: 7.1 surround (front left, front center, front right, side left, side right, rear left, rear right, LFE)</t> |
</list> |
-This set of surround configurations and speaker location orderings is the same |
- as the one used by the Vorbis codec <xref target="vorbis-mapping"/>. |
+</t> |
+<t> |
+This set of surround options and speaker location orderings is the same |
+ as those used by the Vorbis codec <xref target="vorbis-mapping"/>. |
The ordering is different from the one used by the |
WAVE <xref target="wave-multichannel"/> and |
FLAC <xref target="flac"/> formats, |
- so correct ordering requires permutation of the output channels when encoding |
- from or decoding to those formats. |
+ so correct ordering requires permutation of the output channels when decoding |
+ to or encoding from those formats. |
'LFE' here refers to a Low Frequency Effects, often mapped to a subwoofer |
- with no particular spacial position. |
+ with no particular spatial position. |
Implementations SHOULD identify 'side' or 'rear' speaker locations with |
'surround' and 'back' as appropriate when interfacing with audio formats |
or systems which prefer that terminology. |
-Speaker configurations other than those described here are not supported. |
</t> |
</section> |
@@ -811,7 +932,7 @@ Implementations MAY use the following matricies to implement downmixing from |
Family 1</xref>, which are known to give acceptable results for stereo. |
Matricies for 3 and 4 channels are normalized so each coefficent row sums |
to 1 to avoid clipping. |
-For 5 or more channels they are normalized to 2 as a compromize between |
+For 5 or more channels they are normalized to 2 as a compromise between |
clipping and dynamic range reduction. |
</t> |
<t> |
@@ -828,8 +949,8 @@ Rear channels are mixed more diffusely or attenuated to maintain focus |
title="Stereo downmix matrix for the linear surround channel mapping" |
align="center"> |
<artwork align="center"><![CDATA[ |
- Left output = ( 0.585786 * left + 0.414214 * center ) |
-Right output = ( 0.414214 * center + 0.585786 * right ) |
+L output = ( 0.585786 * left + 0.414214 * center ) |
+R output = ( 0.414214 * center + 0.585786 * right ) |
]]></artwork> |
<postamble> |
Exact coefficient values are 1 and 1/sqrt(2), multiplied by |
@@ -965,7 +1086,8 @@ The coeffients are in the same order as in <xref target="channel_mapping_1" />, |
<t> |
The comment header consists of a 64-bit magic signature, followed by data in |
the same format as the <xref target="vorbis-comment"/> header used in Ogg |
- Vorbis (without the final "framing bit"), Ogg Theora, and Speex. |
+ Vorbis, except that (like Ogg Theora and Speex) the final "framing bit" |
+ specified in the Vorbis spec is not present. |
<list style="numbers"> |
<t><spanx style="strong">Magic Signature</spanx>: |
<vspace blankLines="1"/> |
@@ -998,7 +1120,7 @@ It MUST NOT indicate that the vendor string is longer than the rest of the |
<vspace blankLines="1"/> |
This is a simple human-readable tag for vendor information, encoded as a UTF-8 |
string <xref target="RFC3629"/>. |
-No terminating null octet is required. |
+No terminating null octet is necessary. |
<vspace blankLines="1"/> |
This tag is intended to identify the codec encoder and encapsulation |
implementations, for tracing differences in technical behavior. |
@@ -1041,64 +1163,103 @@ The vendor string length and user comment list length are REQUIRED, and |
for these fields, or that do not contain enough data for the corresponding |
vendor string or user comments they describe. |
Making this check before allocating the associated memory to contain the data |
- may help prevent a possible Denial-of-Service (DoS) attack from small comment |
+ helps prevent a possible Denial-of-Service (DoS) attack from small comment |
headers that claim to contain strings longer than the entire packet or more |
user comments than than could possibly fit in the packet. |
</t> |
<t> |
+Immediately following the user comment list, the comment header MAY |
+ contain zero-padding or other binary data which is not specified here. |
+If the least-significant bit of the first byte of this data is 1, then editors |
+ SHOULD preserve the contents of this data when updating the tags, but if this |
+ bit is 0, all such data MAY be treated as padding, and truncated or discarded |
+ as desired. |
+</t> |
+ |
+<section anchor="comment_format" title="Tag Definitions"> |
+<t> |
The user comment strings follow the NAME=value format described by |
- <xref target="vorbis-comment"/> with the same recommended tag names. |
-One new comment tag is introduced for Ogg Opus: |
+ <xref target="vorbis-comment"/> with the same recommended tag names: |
+ ARTIST, TITLE, DATE, ALBUM, and so on. |
+</t> |
+<t> |
+Two new comment tags are introduced here: |
+</t> |
+ |
<figure align="center"> |
+ <preamble>An optional gain for track normalization</preamble>
<artwork align="left"><![CDATA[ |
R128_TRACK_GAIN=-573 |
]]></artwork> |
-</figure> |
-representing the volume shift needed to normalize the track's volume. |
+<postamble> |
+representing the volume shift needed to normalize the track's volume |
+ during isolated playback, in random shuffle, and so on. |
The gain is a Q7.8 fixed point number in dB, as in the ID header's 'output |
gain' field. |
+</postamble> |
+</figure> |
+<t> |
This tag is similar to the REPLAYGAIN_TRACK_GAIN tag in |
Vorbis <xref target="replay-gain"/>, except that the normal volume |
reference is the <xref target="EBU-R128"/> standard. |
</t> |
+<figure align="center"> |
+ <preamble>An optional gain for album normalization</preamble>
+<artwork align="left"><![CDATA[ |
+R128_ALBUM_GAIN=111 |
+]]></artwork> |
+<postamble> |
+representing the volume shift needed to normalize the overall volume when |
+ played as part of a particular collection of tracks. |
+The gain is also a Q7.8 fixed point number in dB, as in the ID header's |
+ 'output gain' field. |
+</postamble> |
+</figure> |
<t> |
-An Ogg Opus file MUST NOT have more than one such tag, and if present its |
- value MUST be an integer from -32768 to 32767, inclusive, represented in |
- ASCII with no whitespace. |
-If present, it MUST correctly represent the R128 normalization gain relative |
- to the 'output gain' field specified in the ID header. |
-If a player chooses to make use of the R128_TRACK_GAIN tag, it MUST be |
- applied <spanx style="emph">in addition</spanx> to the 'output gain' value. |
-If an encoder wishes to use R128 normalization, and the output gain is not |
- otherwise constrained or specified, the encoder SHOULD write the R128 gain |
- into the 'output gain' field and store a tag containing "R128_TRACK_GAIN=0". |
-That is, it should assume that by default tools will respect the 'output gain' |
- field, and not the comment tag. |
+An Ogg Opus stream MUST NOT have more than one of each tag, and if present |
+ their values MUST be integers from -32768 to 32767, inclusive,
+ represented in ASCII as a base 10 number with no whitespace. |
+A leading '+' or '-' character is valid. |
+Leading zeros are also permitted, but the value MUST be represented by |
+ no more than 6 characters. |
+Other non-digit characters MUST NOT be present. |
+</t> |
+<t> |
+If present, R128_TRACK_GAIN and R128_ALBUM_GAIN MUST correctly represent |
+ the R128 normalization gain relative to the 'output gain' field specified |
+ in the ID header. |
+If a player chooses to make use of the R128_TRACK_GAIN tag or the |
+ R128_ALBUM_GAIN tag, it MUST apply those gains |
+ <spanx style="emph">in addition</spanx> to the 'output gain' value. |
If a tool modifies the ID header's 'output gain' field, it MUST also update or |
- remove the R128_TRACK_GAIN comment tag. |
+ remove the R128_TRACK_GAIN and R128_ALBUM_GAIN comment tags if present. |
+An encoder SHOULD assume that by default tools will respect the 'output gain' |
+ field, and not the comment tags.
</t> |
<t> |
To avoid confusion with multiple normalization schemes, an Opus comment header |
SHOULD NOT contain any of the REPLAYGAIN_TRACK_GAIN, REPLAYGAIN_TRACK_PEAK, |
REPLAYGAIN_ALBUM_GAIN, or REPLAYGAIN_ALBUM_PEAK tags. |
+<xref target="EBU-R128"/> normalization is preferred to the earlier |
+ REPLAYGAIN schemes because of its clear definition and adoption by industry. |
+Peak normalizations are difficult to calculate reliably for lossy codecs |
+ because of variation in excursion heights due to decoder differences. |
+In the authors' investigations they were not applied consistently or broadly |
+ enough to merit inclusion here. |
</t> |
-<t> |
-There is no Opus comment tag corresponding to REPLAYGAIN_ALBUM_GAIN. |
-That information should instead be stored in the ID header's 'output gain' |
- field. |
-</t> |
-</section> |
+</section> <!-- end comment_format --> |
+</section> <!-- end comment_header --> |
-</section> |
+</section> <!-- end headers --> |
<section anchor="packet_size_limits" title="Packet Size Limits"> |
<t> |
-Technically valid Opus packets can be arbitrarily large due to the padding |
+Technically, valid Opus packets can be arbitrarily large due to the padding |
format, although the amount of non-padding data they can contain is bounded. |
These packets might be spread over a similarly enormous number of Ogg pages. |
-Encoders SHOULD use no more padding than required to make a variable bitrate |
- (VBR) stream constant bitrate (CBR). |
+Encoders SHOULD use no more padding than is necessary to make a variable |
+ bitrate (VBR) stream constant bitrate (CBR). |
Decoders SHOULD avoid attempting to allocate excessive amounts of memory when |
presented with a very large packet. |
The presence of an extremely large packet in the stream could indicate a |
@@ -1122,11 +1283,11 @@ Even in such a packet, most of the data will be zeros as 2.5 ms frames |
The largest packet consisting of entirely useful data is |
(15,326*N - 2) octets, or about 15 kB per stream. |
This corresponds to 120 ms of audio encoded as 10 ms frames in either |
- LP or Hybrid mode, but at a data rate of over 1 Mbps, which makes little |
+ SILK or Hybrid mode, but at a data rate of over 1 Mbps, which makes little |
sense for the quality achieved. |
A more reasonable limit is (7,664*N - 2) octets, or about 7.5 kB |
per stream. |
-This corresponds to 120 ms of audio encoded as 20 ms stereo MDCT-mode |
+This corresponds to 120 ms of audio encoded as 20 ms stereo CELT mode |
frames, with a total bitrate just under 511 kbps (not counting the Ogg |
encapsulation overhead). |
With N=8, the maximum number of channels currently defined by mapping |
@@ -1141,7 +1302,7 @@ An implementation could reasonably choose any of these numbers for its internal |
<section anchor="encoder" title="Encoder Guidelines"> |
<t> |
-When encoding Opus files, Ogg encoders should take into account the |
+When encoding Opus streams, Ogg muxers SHOULD take into account the |
algorithmic delay of the Opus encoder. |
</t> |
<figure align="center"> |
@@ -1150,18 +1311,19 @@ In encoders derived from the reference implementation, the number of |
samples can be queried with: |
</preamble> |
<artwork align="center"><![CDATA[ |
- opus_encoder_ctl(encoder_state, OPUS_GET_LOOKAHEAD, &samples_delay); |
+ opus_encoder_ctl(encoder_state, OPUS_GET_LOOKAHEAD(&delay_samples)); |
]]></artwork> |
</figure> |
<t> |
To achieve good quality in the very first samples of a stream, the Ogg encoder |
- MAY use LPC extrapolation to generate at least 120 extra samples |
- (extra_samples) at the beginning to avoid the Opus encoder having to encode |
- a discontinuous signal. |
-For an input file containing length samples, the Ogg encoder SHOULD set the |
- preskip header flag to samples_delay+extra_samples, encode at least |
- length+samples_delay+extra_samples samples, and set the granulepos of the last |
- page to length+samples_delay+extra_samples. |
+ MAY use linear predictive coding (LPC) extrapolation |
+ <xref target="linear-prediction"/> to generate at least 120 extra samples
+ (extra_samples) at the beginning to avoid the Opus encoder having to encode
+ a discontinuous signal.
+For an input file containing 'length' samples, the Ogg encoder SHOULD set the |
+ pre-skip header value to delay_samples+extra_samples, encode at least |
+ length+delay_samples+extra_samples samples, and set the granulepos of the last |
+ page to length+delay_samples+extra_samples. |
This ensures that the encoded file has the same duration as the original, with |
no time offset. The best way to pad the end of the stream is to also use LPC |
extrapolation, but zero-padding is also acceptable. |
@@ -1170,7 +1332,7 @@ This ensures that the encoded file has the same duration as the original, with |
<section anchor="lpc" title="LPC Extrapolation"> |
<t> |
The first step in LPC extrapolation is to compute linear prediction |
- coefficients. |
+ coefficients <xref target="lpc-sample"/>.
When extending the end of the signal, order-N (typically with N ranging from 8 |
to 40) LPC analysis is performed on a window near the end of the signal. |
The last N samples are used as memory to an infinite impulse response (IIR) |
@@ -1205,7 +1367,7 @@ When extending the beginning of the signal, it is best to apply a "fade in" to |
<section anchor="continuous_chaining" title="Continuous Chaining"> |
<t> |
In some applications, such as Internet radio, it is desirable to cut a long |
- streams into smaller chains, e.g. so the comment header can be updated. |
+ stream into smaller chains, e.g. so the comment header can be updated. |
This can be done simply by separating the input streams into segments and |
encoding each segment independently. |
The drawback of this approach is that it creates a small discontinuity |
@@ -1219,12 +1381,26 @@ De-emphasis is allowed.</t> |
frame.</t> |
<t>Begin the second segment with a copy of the last frame of the first |
segment.</t> |
-<t>Set the preskip flag of the second stream in such a way as to properly |
+<t>Set the pre-skip value of the second stream in such a way as to properly |
join the two streams.</t> |
<t>Continue the encoding process normally from there, without any reset to |
the encoder.</t> |
</list> |
</t> |
+<figure align="center"> |
+<preamble> |
+In encoders derived from the reference implementation, inter-frame prediction |
+ can be turned off by calling: |
+</preamble> |
+<artwork align="center"><![CDATA[ |
+ opus_encoder_ctl(encoder_state, OPUS_SET_PREDICTION_DISABLED(1)); |
+]]></artwork> |
+<postamble> |
+For best results, this implementation requires that prediction be explicitly |
+ enabled again before resuming normal encoding, even after a reset. |
+</postamble> |
+</figure> |
+ |
</section> |
</section> |
@@ -1237,7 +1413,7 @@ A brief summary of major implementations of this draft is available |
</t> |
<t> |
[Note to RFC Editor: please remove this entire section before |
- final publication per <xref target="draft-sheffer-running-code"/>.] |
+ final publication per <xref target="RFC6982"/>.] |
</t> |
</section> |
@@ -1248,16 +1424,16 @@ Implementations of the Opus codec need to take appropriate security |
This is just as much a problem for the container as it is for the codec itself. |
It is extremely important for the decoder to be robust against malicious |
payloads. |
-Malicious payloads must not cause the decoder to overrun its allocated memory |
+Malicious payloads MUST NOT cause the decoder to overrun its allocated memory |
or to take an excessive amount of resources to decode. |
Although problems in encoders are typically rarer, the same applies to the |
encoder. |
-Malicious audio streams must not cause the encoder to misbehave because this |
+Malicious audio streams MUST NOT cause the encoder to misbehave because this |
would allow an attacker to attack transcoding gateways. |
</t> |
<t> |
-Like most other container formats, Ogg Opus files should not be used with |
+Like most other container formats, Ogg Opus streams SHOULD NOT be used with |
insecure ciphers or cipher modes that are vulnerable to known-plaintext |
attacks. |
Elements such as the Ogg page capture pattern and the magic signatures in the |
@@ -1336,16 +1512,18 @@ The authors agree to grant third parties the irrevocable right to copy, use, |
&rfc6381; |
&rfc6716; |
-<reference anchor="EBU-R128" target="http://tech.ebu.ch/loudness"> |
+<reference anchor="EBU-R128" target="https://tech.ebu.ch/loudness"> |
<front> |
-<title>"Loudness Recommendation EBU R128</title> |
-<author fullname="EBU Technical Committee"/> |
-<date month="August" year="2011"/> |
+ <title>Loudness Recommendation EBU R128</title> |
+ <author> |
+ <organization>EBU Technical Committee</organization> |
+ </author> |
+ <date month="August" year="2011"/> |
</front> |
</reference> |
<reference anchor="vorbis-comment" |
- target="http://www.xiph.org/vorbis/doc/v-comment.html"> |
+ target="https://www.xiph.org/vorbis/doc/v-comment.html"> |
<front> |
<title>Ogg Vorbis I Format Specification: Comment Field and Header |
Specification</title> |
@@ -1361,16 +1539,7 @@ The authors agree to grant third parties the irrevocable right to copy, use, |
<!--?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.3550.xml"?--> |
&rfc4732; |
- |
-<reference anchor="draft-sheffer-running-code" |
- target="https://tools.ietf.org/html/draft-sheffer-running-code-05#section-2"> |
- <front> |
- <title>Improving "Rough Consensus" with Running Code</title> |
- <author initials="Y." surname="Sheffer" fullname="Yaron Sheffer"/> |
- <author initials="A." surname="Farrel" fullname="Adrian Farrel"/> |
- <date month="May" year="2013"/> |
- </front> |
-</reference> |
+ &rfc6982; |
<reference anchor="flac" |
target="https://xiph.org/flac/format.html"> |
@@ -1382,16 +1551,41 @@ The authors agree to grant third parties the irrevocable right to copy, use, |
</reference> |
<reference anchor="hanning" |
- target="http://en.wikipedia.org/wiki/Hamming_function#Hann_.28Hanning.29_window"> |
+ target="https://en.wikipedia.org/wiki/Hamming_function#Hann_.28Hanning.29_window"> |
<front> |
- <title>"Hann window</title> |
- <author fullname="Wikipedia"/> |
+ <title>Hann window</title> |
+ <author> |
+ <organization>Wikipedia</organization> |
+ </author> |
<date month="May" year="2013"/> |
</front> |
</reference> |
+<reference anchor="linear-prediction" |
+ target="https://en.wikipedia.org/wiki/Linear_predictive_coding"> |
+ <front> |
+ <title>Linear Predictive Coding</title> |
+ <author> |
+ <organization>Wikipedia</organization> |
+ </author> |
+ <date month="January" year="2014"/> |
+ </front> |
+</reference> |
+ |
+<reference anchor="lpc-sample" |
+ target="https://svn.xiph.org/trunk/vorbis/lib/lpc.c"> |
+<front> |
+ <title>Autocorrelation LPC coeff generation algorithm |
+ (Vorbis source code)</title> |
+<author initials="J." surname="Degener" fullname="Jutta Degener"/> |
+<author initials="C." surname="Bormann" fullname="Carsten Bormann"/> |
+<date month="November" year="1994"/> |
+</front> |
+</reference> |
+ |
+ |
<reference anchor="replay-gain" |
- target="http://wiki.xiph.org/VorbisComment#Replay_Gain"> |
+ target="https://wiki.xiph.org/VorbisComment#Replay_Gain"> |
<front> |
<title>VorbisComment: Replay Gain</title> |
<author initials="C." surname="Parker" fullname="Conrad Parker"/> |
@@ -1401,7 +1595,7 @@ The authors agree to grant third parties the irrevocable right to copy, use, |
</reference> |
<reference anchor="seeking" |
- target="http://wiki.xiph.org/Seeking"> |
+ target="https://wiki.xiph.org/Seeking"> |
<front> |
<title>Granulepos Encoding and How Seeking Really Works</title> |
<author initials="S." surname="Pfeiffer" fullname="Silvia Pfeiffer"/> |
@@ -1412,7 +1606,7 @@ The authors agree to grant third parties the irrevocable right to copy, use, |
</reference> |
<reference anchor="vorbis-mapping" |
- target="http://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-800004.3.9"> |
+ target="https://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-800004.3.9"> |
<front> |
<title>The Vorbis I Specification, Section 4.3.9 Output Channel Order</title> |
<author initials="C." surname="Montgomery" |
@@ -1422,7 +1616,7 @@ The authors agree to grant third parties the irrevocable right to copy, use, |
</reference> |
<reference anchor="vorbis-trim" |
- target="http://xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-130000A.2"> |
+ target="https://xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-130000A.2"> |
<front> |
<title>The Vorbis I Specification, Appendix A: Embedding Vorbis |
into an Ogg stream</title> |
@@ -1436,7 +1630,9 @@ The authors agree to grant third parties the irrevocable right to copy, use, |
target="http://msdn.microsoft.com/en-us/windows/hardware/gg463006.aspx"> |
<front> |
<title>Multiple Channel Audio Data and WAVE Files</title> |
- <author fullname="Microsoft Corporation"/> |
+ <author> |
+ <organization>Microsoft Corporation</organization> |
+ </author> |
<date month="March" year="2007"/> |
</front> |
</reference> |