Merge pull request #2381 from senhuang42/expand_sequence_extraction_api

Add enum to define ZSTD_Sequence type and update sequence extraction API
This commit is contained in:
sen 2020-11-06 13:00:31 -05:00 committed by GitHub
commit f62edf0fe9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 62 additions and 16 deletions

View File

@ -2505,6 +2505,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
for (i = 0; i < seqStoreSeqSize; ++i) { for (i = 0; i < seqStoreSeqSize; ++i) {
outSeqs[i].litLength = seqStoreSeqs[i].litLength; outSeqs[i].litLength = seqStoreSeqs[i].litLength;
outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH; outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH;
outSeqs[i].rep = 0;
if (i == seqStore->longLengthPos) { if (i == seqStore->longLengthPos) {
if (seqStore->longLengthID == 1) { if (seqStore->longLengthID == 1) {
@ -2549,8 +2550,8 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
zc->seqCollector.seqIndex += seqStoreSeqSize; zc->seqCollector.seqIndex += seqStoreSeqSize;
} }
size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
size_t outSeqsSize, const void* src, size_t srcSize) size_t outSeqsSize, const void* src, size_t srcSize)
{ {
const size_t dstCapacity = ZSTD_compressBound(srcSize); const size_t dstCapacity = ZSTD_compressBound(srcSize);
void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem);
@ -2569,6 +2570,22 @@ size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
return zc->seqCollector.seqIndex; return zc->seqCollector.seqIndex;
} }
/* ZSTD_mergeBlockDelimiters() :
 * Compacts `sequences` in place, dropping every entry whose offset and matchLength are both 0.
 * The litLength of a dropped entry is added to the entry that immediately follows it;
 * when the dropped entry is the last one of the array, its litLength is simply discarded.
 * @return : number of entries kept, now stored at the front of `sequences`
 */
size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize) {
    size_t kept = 0;
    size_t idx;
    for (idx = 0; idx < seqsSize; idx++) {
        int const isDelimiter = (sequences[idx].offset == 0) && (sequences[idx].matchLength == 0);
        if (!isDelimiter) {
            /* Regular sequence : slide it down to the next free slot */
            sequences[kept++] = sequences[idx];
            continue;
        }
        /* Dropped entry : hand its literals over to the following entry, if there is one */
        if (idx + 1 < seqsSize) {
            sequences[idx+1].litLength += sequences[idx].litLength;
        }
    }
    return kept;
}
/* Returns true if the given block is a RLE block */ /* Returns true if the given block is a RLE block */
static int ZSTD_isRLE(const BYTE *ip, size_t length) { static int ZSTD_isRLE(const BYTE *ip, size_t length) {
size_t i; size_t i;

View File

@ -1149,7 +1149,7 @@ typedef struct {
* rep == 2 --> offset == repeat_offset_3 * rep == 2 --> offset == repeat_offset_3
* rep == 3 --> offset == repeat_offset_1 - 1 * rep == 3 --> offset == repeat_offset_1 - 1
* *
* Note: This field is optional. ZSTD_getSequences() will calculate the value of * Note: This field is optional. ZSTD_generateSequences() will calculate the value of
* 'rep', but repeat offsets do not necessarily need to be calculated from an external * 'rep', but repeat offsets do not necessarily need to be calculated from an external
* sequence provider's perspective. * sequence provider's perspective.
*/ */
@ -1297,17 +1297,36 @@ ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcS
* or an error code (if srcSize is too small) */ * or an error code (if srcSize is too small) */
ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
/*! ZSTD_getSequences() : typedef enum {
* Extract sequences from the sequence store. ZSTD_sf_explicitBlockDelimiters, /* Representation of ZSTD_Sequence contains explicit block delimiters */
* Each block will end with a dummy sequence with offset == 0, matchLength == 0, and litLength == length of last literals. ZSTD_sf_noBlockDelimiters /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
} ZSTD_sequenceFormat_e;
/*! ZSTD_generateSequences() :
* Generate sequences using ZSTD_compress2, given a source buffer.
*
* Each block will end with a dummy sequence
* with offset == 0, matchLength == 0, and litLength == length of last literals.
* litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
* simply acts as a block delimiter.
* *
* zc can be used to insert custom compression params. * zc can be used to insert custom compression params.
* This function invokes ZSTD_compress2 * This function invokes ZSTD_compress2
* @return : number of sequences extracted * @return : number of sequences generated
*/ */
ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
size_t outSeqsSize, const void* src, size_t srcSize);
ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
size_t outSeqsSize, const void* src, size_t srcSize);
/*! ZSTD_mergeBlockDelimiters() :
* Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
* by merging them into the literals of the next sequence.
*
* As such, the final generated result has no explicit representation of block boundaries,
* and the final last literals segment is not represented in the sequences.
* @return : number of sequences left after merging
*/
ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize);
/*************************************** /***************************************
* Memory management * Memory management

View File

@ -305,13 +305,17 @@ static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
#endif #endif
static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize, BYTE* src, size_t size) static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize,
BYTE* src, size_t size, ZSTD_sequenceFormat_e format)
{ {
size_t i; size_t i;
size_t j; size_t j;
for(i = 0; i < seqsSize; ++i) { for(i = 0; i < seqsSize; ++i) {
assert(dst + seqs[i].litLength + seqs[i].matchLength <= dst + size); assert(dst + seqs[i].litLength + seqs[i].matchLength <= dst + size);
assert(src + seqs[i].litLength + seqs[i].matchLength <= src + size); assert(src + seqs[i].litLength + seqs[i].matchLength <= src + size);
if (format == ZSTD_sf_noBlockDelimiters) {
assert(seqs[i].matchLength != 0 || seqs[i].offset != 0);
}
memcpy(dst, src, seqs[i].litLength); memcpy(dst, src, seqs[i].litLength);
dst += seqs[i].litLength; dst += seqs[i].litLength;
@ -326,6 +330,9 @@ static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize,
size -= seqs[i].matchLength; size -= seqs[i].matchLength;
} }
} }
if (format == ZSTD_sf_noBlockDelimiters) {
memcpy(dst, src, size);
}
} }
/*============================================= /*=============================================
@ -2702,9 +2709,9 @@ static int basicUnitTests(U32 const seed, double compressibility)
DISPLAYLEVEL(3, "OK \n"); DISPLAYLEVEL(3, "OK \n");
} }
DISPLAYLEVEL(3, "test%3i : ZSTD_getSequences decode from sequences test : ", testNb++); DISPLAYLEVEL(3, "test%3i : ZSTD_generateSequences decode from sequences test : ", testNb++);
{ {
size_t srcSize = 100 KB; size_t srcSize = 150 KB;
BYTE* src = (BYTE*)CNBuffer; BYTE* src = (BYTE*)CNBuffer;
BYTE* decoded = (BYTE*)compressedBuffer; BYTE* decoded = (BYTE*)compressedBuffer;
@ -2718,11 +2725,14 @@ static int basicUnitTests(U32 const seed, double compressibility)
/* Populate src with random data */ /* Populate src with random data */
RDG_genBuffer(CNBuffer, srcSize, compressibility, 0., seed); RDG_genBuffer(CNBuffer, srcSize, compressibility, 0., seed);
/* get the sequences */ /* Test with block delimiters roundtrip */
seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize); seqsSize = ZSTD_generateSequences(cctx, seqs, srcSize, src, srcSize);
FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters);
assert(!memcmp(CNBuffer, compressedBuffer, srcSize));
/* "decode" and compare the sequences */ /* Test no block delimiters roundtrip */
FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize); seqsSize = ZSTD_mergeBlockDelimiters(seqs, seqsSize);
FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_noBlockDelimiters);
assert(!memcmp(CNBuffer, compressedBuffer, srcSize)); assert(!memcmp(CNBuffer, compressedBuffer, srcSize));
ZSTD_freeCCtx(cctx); ZSTD_freeCCtx(cctx);