From 23f05ccc6bfb1dd72851495aceebfae6da08654a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 4 Jul 2016 16:13:11 +0200 Subject: [PATCH] updated specifications --- lib/decompress/zstd_decompress.c | 22 +-- zstd_compression_format.md | 323 +++++++++++++++++++++++++++---- 2 files changed, 298 insertions(+), 47 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index 001a19ae..22820582 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -195,7 +195,7 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) /* Frame format description Frame Header - [ Block Header - Block ] - Frame End 1) Frame Header - - 4 bytes - Magic Number : ZSTD_MAGICNUMBER (defined within zstd_static.h) + - 4 bytes - Magic Number : ZSTD_MAGICNUMBER (defined within zstd.h) - 1 byte - Frame Descriptor 2) Block Header - 3 bytes, starting with a 2-bits descriptor @@ -629,7 +629,7 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeqPtr, /* FSE table descriptors */ { U32 const LLtype = *ip >> 6; - U32 const Offtype = (*ip >> 4) & 3; + U32 const OFtype = (*ip >> 4) & 3; U32 const MLtype = (*ip >> 2) & 3; ip++; @@ -637,17 +637,17 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeqPtr, if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */ /* Build DTables */ - { size_t const bhSize = ZSTD_buildSeqTable(DTableLL, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_defaultNorm, LL_defaultNormLog, flagRepeatTable); - if (ZSTD_isError(bhSize)) return ERROR(corruption_detected); - ip += bhSize; + { size_t const llhSize = ZSTD_buildSeqTable(DTableLL, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_defaultNorm, LL_defaultNormLog, flagRepeatTable); + if (ZSTD_isError(llhSize)) return ERROR(corruption_detected); + ip += llhSize; } - { size_t const bhSize = ZSTD_buildSeqTable(DTableOffb, Offtype, MaxOff, OffFSELog, ip, iend-ip, OF_defaultNorm, OF_defaultNormLog, flagRepeatTable); - if 
(ZSTD_isError(bhSize)) return ERROR(corruption_detected); - ip += bhSize; + { size_t const ofhSize = ZSTD_buildSeqTable(DTableOffb, OFtype, MaxOff, OffFSELog, ip, iend-ip, OF_defaultNorm, OF_defaultNormLog, flagRepeatTable); + if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected); + ip += ofhSize; } - { size_t const bhSize = ZSTD_buildSeqTable(DTableML, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_defaultNorm, ML_defaultNormLog, flagRepeatTable); - if (ZSTD_isError(bhSize)) return ERROR(corruption_detected); - ip += bhSize; + { size_t const mlhSize = ZSTD_buildSeqTable(DTableML, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_defaultNorm, ML_defaultNormLog, flagRepeatTable); + if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected); + ip += mlhSize; } } return ip-istart; diff --git a/zstd_compression_format.md b/zstd_compression_format.md index a0130501..dbadac75 100644 --- a/zstd_compression_format.md +++ b/zstd_compression_format.md @@ -134,9 +134,9 @@ delivering the final decompressed result as if it was a single content. Frame Header ------------- -| FHD | (WD) | (Content Size) | (dictID) | -| ------- | --------- |:--------------:| --------- | -| 1 byte | 0-1 byte | 0 - 8 bytes | 0-4 bytes | +| FHD | (WD) | (dictID) | (Content Size) | +| ------- | --------- | --------- |:--------------:| +| 1 byte | 0-1 byte | 0-4 bytes | 0 - 8 bytes | Frame header has a variable size, which uses a minimum of 2 bytes, and up to 14 bytes depending on optional parameters. @@ -145,11 +145,11 @@ __FHD byte__ (Frame Header Descriptor) The first Header's byte is called the Frame Header Descriptor. It tells which other fields are present. -Decoding this byte is enough to get the full size of the Frame Header. +Decoding this byte is enough to tell the size of Frame Header. 
-| BitNb | 7-6 | 5 | 4 | 3 | 2 | 1-0 | -| ------- | ------ | ------- | ------ | -------- | -------- | -------- | -|FieldName| FCSize | Segment | Unused | Reserved | Checksum | dictID | +| BitNb | 7-6 | 5 | 4 | 3 | 2 | 1-0 | +| ------- | ------ | ------- | ------ | -------- | -------- | ------ | +|FieldName| FCSize | Segment | Unused | Reserved | Checksum | dictID | In this table, bit 7 is highest bit, while bit 0 is lowest. @@ -162,28 +162,28 @@ specifying if decompressed data size is provided within the header. | ------- | --- | --- | --- | --- | |FieldSize| 0-1 | 2 | 4 | 8 | -Value 0 has a double meaning : +Value 0 meaning depends on _single segment_ mode : it either means `0` (size not provided) _if_ the `WD` byte is present, -or it means `1` byte (size <= 255 bytes). +or `1` (frame content size <= 255 bytes) otherwise. __Single Segment__ If this flag is set, data shall be regenerated within a single continuous memory segment. + In which case, `WD` byte __is not present__, but `Frame Content Size` field necessarily is. - As a consequence, the decoder must allocate a memory segment of size `>= Frame Content Size`. In order to preserve the decoder from unreasonable memory requirement, -a decoder can refuse a compressed frame +a decoder can reject a compressed frame which requests a memory size beyond decoder's authorized range. For broader compatibility, decoders are recommended to support -memory sizes of 8 MB at least. -However, this is merely a recommendation, -and each decoder is free to support higher or lower limits, +memory sizes of at least 8 MB. +This is just a recommendation, +as each decoder is free to support higher or lower limits, depending on local limitations. __Unused bit__ @@ -254,6 +254,21 @@ It's merely a recommendation though, decoders are free to support larger or lower limits, depending on local limitations. +__Dictionary ID__ + +This is a variable size field, which contains an ID. +It checks if the correct dictionary is used for decoding. 
+Note that this field is optional. If it's not present, +it's up to the caller to make sure it uses the correct dictionary. + +Field size depends on __Dictionary ID flag__. +1 byte can represent an ID 0-255. +2 bytes can represent an ID 0-65535. +4 bytes can represent an ID 0-(2^32-1). + +It's allowed to represent a small ID (for example `13`) +with a large 4-bytes dictionary ID, losing some efficiency in the process. + __Frame Content Size__ This is the original (uncompressed) size. @@ -274,27 +289,12 @@ When field size is 2, _an offset of 256 is added_. It's allowed to represent a small size (ex: `18`) using the 8-bytes variant. A size of `0` means `content size is unknown`. In which case, the `WD` byte will necessarily be present, -and becomes the only hint to determine memory allocation. +and becomes the only hint to help memory allocation. In order to preserve decoder from unreasonable memory requirement, a decoder can refuse a compressed frame which requests a memory size beyond decoder's authorized range. -__Dictionary ID__ - -This is a variable size field, which contains a single ID. -It checks if the correct dictionary is used for decoding. -Note that this field is optional. If it's not present, -it's up to the caller to make sure it uses the correct dictionary. - -Field size depends on __Dictionary ID flag__. -1 byte can represent an ID 0-255. -2 bytes can represent an ID 0-65535. -4 bytes can represent an ID 0-(2^32-1). - -It's allowed to represent a small ID (for example `13`) -with a large 4-bytes dictionary ID, losing some efficiency in the process. - Data Blocks ----------- @@ -364,7 +364,6 @@ over user-defined data and continue decoding. Skippable frames defined in this specification are compatible with LZ4 ones. - __Magic Number__ : 4 Bytes, Little endian format. 
@@ -395,8 +394,8 @@ A compressed block consists of 2 sections : - Literals section - Sequences section -### Prerequisite -To decode a compressed block, it's required to access to following elements : +### Prerequisites +To decode a compressed block, the following elements are necessary : - Previous decoded blocks, up to a distance of `windowSize`, or all frame's previous blocks in "single segment" mode. - List of "recent offsets" from previous compressed block. @@ -634,7 +633,6 @@ it gives the following distribution : | nb bits | 0 | 4 | 4 | 3 | 2 | 1 | - #### Literals bitstreams ##### Bitstreams sizes @@ -711,12 +709,265 @@ which specifies a baseline and a number of additional bits. _Codes_ are FSE compressed, and interleaved with raw additional bits in the same bitstream. -The Sequence section starts by a header, -followed by an optional Probability table for each symbol type, +The Sequences section starts by a header, +followed by optional Probability tables for each symbol type, followed by the bitstream. +To decode the Sequences section, it's required to know its size. +This size is deduced from "blockSize - literalSectionSize". + + #### Sequences section header +Consists of 2 items : +- Nb of Sequences +- Flags providing Symbol compression types + +__Nb of Sequences__ + +This is a variable size field, `nbSeqs`, using between 1 and 3 bytes. +Let's call its first byte `byte0`. +- `if (byte0 == 0)` : there are no sequences. + The sequence section stops there. + Regenerated content is defined entirely by literals section. +- `if (byte0 < 128)` : nbSeqs = byte0 . Uses 1 byte. +- `if (byte0 < 255)` : nbSeqs = ((byte0-128) << 8) + byte1 . Uses 2 bytes. +- `if (byte0 == 255)`: nbSeqs = byte1 + (byte2<<8) + 0x7F00 . Uses 3 bytes. + +__Symbol compression modes__ + +This is a single byte, defining the compression mode of each symbol type.
+ +| BitNb | 7-6 | 5-4 | 3-2 | 1-0 | +| ------- | ------ | ------ | ------ | -------- | +|FieldName| LLtype | OFType | MLType | Reserved | + +The last field, `Reserved`, must be all-zeroes. + +`LLtype`, `OFType` and `MLType` define the compression mode of +Literal Lengths, Offsets and Match Lengths respectively. + +They follow the same enumeration : + +| Value | 0 | 1 | 2 | 3 | +| ---------------- | ------ | --- | ------ | --- | +| Compression Mode | predef | RLE | Repeat | FSE | + +- "predef" : uses a pre-defined distribution table. +- "RLE" : it's a single code, repeated `nbSeqs` times. +- "Repeat" : re-use distribution table from previous compressed block. +- "FSE" : standard FSE compression. + Symbol type requires a distribution table, + which will be described in next part. + +#### Symbols decoding + +##### Literal Lengths codes + +Literal lengths codes are values ranging from `0` to `35` included. +They define lengths from 0 to 131071 bytes. + +| Code | 0-15 | +| ------ | ---- | +| nbBits | 0 | +| value | Code | + +| Code | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | +| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| Baseline | 16 | 18 | 20 | 22 | 24 | 28 | 32 | 40 | +| nb Bits | 1 | 1 | 1 | 1 | 2 | 2 | 3 | 3 | + +| Code | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | +| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| Baseline | 48 | 64 | 128 | 256 | 512 | 1024 | 2048 | 4096 | +| nb Bits | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | + +| Code | 32 | 33 | 34 | 35 | +| -------- | ---- | ---- | ---- | ---- | +| Baseline | 8192 |16384 |32768 |65536 | +| nb Bits | 13 | 14 | 15 | 16 | + +__Default distribution__ + +When "compression mode" is defined as "default distribution", +a pre-defined distribution is used for FSE compression. + +Here is its definition. It uses an accuracy of 6 bits (64 states). 
+``` +short literalLengths_defaultDistribution[36] = + { 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, + -1,-1,-1,-1 }; +``` + +##### Match Lengths codes + +Match lengths codes are values ranging from `0` to `52` included. +They define lengths from 3 to 131074 bytes. + +| Code | 0-31 | +| ------ | -------- | +| nbBits | 0 | +| value | Code + 3 | + +| Code | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | +| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| Baseline | 35 | 37 | 39 | 41 | 43 | 47 | 51 | 59 | +| nb Bits | 1 | 1 | 1 | 1 | 2 | 2 | 3 | 3 | + +| Code | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | +| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| Baseline | 67 | 83 | 99 | 131 | 259 | 515 | 1027 | 2051 | +| nb Bits | 4 | 4 | 5 | 7 | 8 | 9 | 10 | 11 | + +| Code | 48 | 49 | 50 | 51 | 52 | +| -------- | ---- | ---- | ---- | ---- | ---- | +| Baseline | 4099 | 8195 |16387 |32771 |65539 | +| nb Bits | 12 | 13 | 14 | 15 | 16 | + +__Default distribution__ + +When "compression mode" is defined as "default distribution", +a pre-defined distribution is used for FSE compression. + +Here is its definition. It uses an accuracy of 6 bits (64 states). +``` +short matchLengths_defaultDistribution[53] = + { 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1, + -1,-1,-1,-1,-1 }; +``` + +##### Offset codes + +Offset codes are values ranging from `0` to `N`, +with `N` being limited by maximum backreference distance. + +A decoder is free to limit its maximum `N` supported, +although the recommendation is to support at least up to `22`. +For information, at the time of this writing, +the reference decoder supports a maximum `N` value of `28` in 64-bits mode.
+ +An offset code is also the nb of additional bits to read, +and can be translated into an `OFValue` using the following formula : + +``` +OFValue = (1 << offsetCode) + readNBits(offsetCode); +if (OFValue > 3) offset = OFValue - 3; +``` + +OFValue from 1 to 3 are special : they define "repeat codes", +which means one of the previous offsets will be repeated. +They are sorted in recency order, with 1 meaning the most recent one. + +__Default distribution__ + +When "compression mode" is defined as "default distribution", +a pre-defined distribution is used for FSE compression. + +Here is its definition. It uses an accuracy of 5 bits (32 states), +and supports a maximum `N` of 28, allowing offset values up to 536,870,908 . + +If any sequence in the compressed block requires an offset larger than this, +it's not possible to use the default distribution to represent it. + +``` +short offsetCodes_defaultDistribution[29] = + { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 }; +``` + +#### Distribution tables + +Following the header, up to 3 distribution tables can be described. +They are, in order : +- Literal lengths +- Offsets +- Match Lengths + +The content to decode depends on their respective compression mode : +- Repeat mode : no content. Re-use distribution from previous compressed block. +- Predef : no content. Use pre-defined distribution table. +- RLE : 1 byte. This is the only code to use across the whole compressed block. +- FSE : A distribution table is present. + +##### FSE distribution table : condensed format + +An FSE distribution table describes the probabilities of all symbols +from `0` to the last present one (included) +on a normalized scale of `2^AccuracyLog` . + +It's a bitstream which is read forward, in little-endian fashion. +It's not necessary to know its exact size, +since it will be discovered and reported by the decoding process. + +The bitstream starts by reporting on which scale it operates.
+`AccuracyLog = low4bits + 5;` +In theory, it can define a scale from 5 to 20. +In practice, decoders are allowed to limit the maximum supported `AccuracyLog`. +Recommended maximums are `9` for literal and match lengths, and `8` for offsets. +The reference decoder uses these limits. + +Then follow each symbol value, from `0` to last present one. +The nb of bits used by each field is variable. +It depends on : + +- Remaining probabilities + 1 : + __example__ : + Presuming an AccuracyLog of 8, + and presuming 100 probabilities points have already been distributed, + the decoder may discover value from `0` to `255 - 100 + 1 == 156` (included). + Therefore, it must read `log2sup(156) == 8` bits. + +- Value decoded : small values use 1 less bit : + __example__ : + Presuming values from 0 to 156 (included) are possible, + 255-156 = 99 values are remaining in an 8-bits field. + They are used this way : + first 99 values (hence from 0 to 98) use only 7 bits, + values from 99 to 156 use 8 bits. + This is achieved through this scheme : + + | Value read | Value decoded | nb Bits used | + | ---------- | ------------- | ------------ | + | 0 - 98 | 0 - 98 | 7 | + | 99 - 127 | 99 - 127 | 8 | + | 128 - 226 | 0 - 98 | 7 | + | 227 - 255 | 128 - 156 | 8 | + +Symbols probabilities are read one by one, in order. + +Probability is obtained from Value decoded by the following formula : +`Proba = value - 1;` + +It means value `0` becomes negative probability `-1`. +`-1` is a special probability, which means `less than 1`. +Its effect on distribution table is described in a later paragraph. +For the purpose of calculating cumulated distribution, it counts as one. + +When a symbol has a probability of `zero`, +it is followed by a 2-bits repeat flag. +This repeat flag tells how many probabilities of zeroes follow the current one. +It provides a number ranging from 0 to 3. +If it is a 3, another 2-bits repeat flag follows, and so on.
+ +When last symbol reaches cumulated total of `2^AccuracyLog`, +decoding is complete. +Then the decoder can tell how many bytes were used in this process, +and how many symbols are present. + +The bitstream consumes a round number of bytes. +Any remaining bit within the last byte is just unused. + +If the last symbol makes cumulated total go above `2^AccuracyLog`, +distribution is considered corrupted. + +##### FSE decoding : from normalized distribution to decoding tables + + + +#### Bitstream