Merge pull request #654 from iburinoc/splittable

[RFC] Splittable Format and API
dev
Yann Collet 2017-05-08 13:41:56 -07:00 committed by GitHub
commit d47709b6ea
12 changed files with 1852 additions and 0 deletions

View File

@ -0,0 +1,4 @@
seekable_compression
seekable_decompression
parallel_processing
parallel_compression

View File

@ -0,0 +1,42 @@
# ################################################################
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
# ################################################################
# This Makefile presumes libzstd is built, using `make` in / or /lib/
LDFLAGS += ../../../lib/libzstd.a
CPPFLAGS += -I../ -I../../../lib -I../../../lib/common
CFLAGS ?= -O3
CFLAGS += -g
SEEKABLE_OBJS = ../zstdseek_compress.c ../zstdseek_decompress.c
.PHONY: default all clean test
default: all
all: seekable_compression seekable_decompression parallel_processing
seekable_compression : seekable_compression.c $(SEEKABLE_OBJS)
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
seekable_decompression : seekable_decompression.c $(SEEKABLE_OBJS)
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
parallel_processing : parallel_processing.c $(SEEKABLE_OBJS)
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -pthread
parallel_compression : parallel_compression.c $(SEEKABLE_OBJS)
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -pthread
clean:
@rm -f core *.o tmp* result* *.zst \
seekable_compression seekable_decompression \
parallel_processing parallel_compression
@echo Cleaning completed

View File

@ -0,0 +1,214 @@
/**
* Copyright 2017-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the license found in the
* LICENSE-examples file in the root directory of this source tree.
*/
#include <stdlib.h> // malloc, free, exit, atoi
#include <stdio.h> // fprintf, perror, feof, fopen, etc.
#include <string.h> // strlen, memset, strcat
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h> // presumes zstd library is installed
#include <zstd_errors.h>
#if defined(WIN32) || defined(_WIN32)
# include <windows.h>
# define SLEEP(x) Sleep(x)
#else
# include <unistd.h>
# define SLEEP(x) usleep(x * 1000)
#endif
#define XXH_NAMESPACE ZSTD_
#include "xxhash.h"
#include "pool.h" // use zstd thread pool for demo
#include "zstd_seekable.h"
static void* malloc_orDie(size_t size)
{
void* const buff = malloc(size);
if (buff) return buff;
/* error */
perror("malloc:");
exit(1);
}
static FILE* fopen_orDie(const char *filename, const char *instruction)
{
FILE* const inFile = fopen(filename, instruction);
if (inFile) return inFile;
/* error */
perror(filename);
exit(3);
}
static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file)
{
size_t const readSize = fread(buffer, 1, sizeToRead, file);
if (readSize == sizeToRead) return readSize; /* good */
if (feof(file)) return readSize; /* good, reached end of file */
/* error */
perror("fread");
exit(4);
}
static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file)
{
size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file);
if (writtenSize == sizeToWrite) return sizeToWrite; /* good */
/* error */
perror("fwrite");
exit(5);
}
static size_t fclose_orDie(FILE* file)
{
if (!fclose(file)) return 0;
/* error */
perror("fclose");
exit(6);
}
static void fseek_orDie(FILE* file, long int offset, int origin)
{
if (!fseek(file, offset, origin)) {
if (!fflush(file)) return;
}
/* error */
perror("fseek");
exit(7);
}
static long int ftell_orDie(FILE* file)
{
long int off = ftell(file);
if (off != -1) return off;
/* error */
perror("ftell");
exit(8);
}
struct job {
const void* src;
size_t srcSize;
void* dst;
size_t dstSize;
unsigned checksum;
int compressionLevel;
int done;
};
static void compressFrame(void* opaque)
{
struct job* job = opaque;
job->checksum = XXH64(job->src, job->srcSize, 0);
size_t ret = ZSTD_compress(job->dst, job->dstSize, job->src, job->srcSize, job->compressionLevel);
if (ZSTD_isError(ret)) {
fprintf(stderr, "ZSTD_compress() error : %s \n", ZSTD_getErrorName(ret));
exit(20);
}
job->dstSize = ret;
job->done = 1;
}
static void compressFile_orDie(const char* fname, const char* outName, int cLevel, unsigned frameSize, int nbThreads)
{
POOL_ctx* pool = POOL_create(nbThreads, nbThreads);
if (pool == NULL) { fprintf(stderr, "POOL_create() error \n"); exit(9); }
FILE* const fin = fopen_orDie(fname, "rb");
FILE* const fout = fopen_orDie(outName, "wb");
if (ZSTD_compressBound(frameSize) > 0xFFFFFFFFU) { fprintf(stderr, "Frame size too large \n"); exit(10); }
unsigned dstSize = ZSTD_compressBound(frameSize);
fseek_orDie(fin, 0, SEEK_END);
long int length = ftell_orDie(fin);
fseek_orDie(fin, 0, SEEK_SET);
size_t numFrames = (length + frameSize - 1) / frameSize;
struct job* jobs = malloc_orDie(sizeof(struct job) * numFrames);
size_t i;
for(i = 0; i < numFrames; i++) {
void* in = malloc_orDie(frameSize);
void* out = malloc_orDie(dstSize);
size_t inSize = fread_orDie(in, frameSize, fin);
jobs[i].src = in;
jobs[i].srcSize = inSize;
jobs[i].dst = out;
jobs[i].dstSize = dstSize;
jobs[i].compressionLevel = cLevel;
jobs[i].done = 0;
POOL_add(pool, compressFrame, &jobs[i]);
}
ZSTD_frameLog* fl = ZSTD_seekable_createFrameLog(1);
if (fl == NULL) { fprintf(stderr, "ZSTD_seekable_createFrameLog() failed \n"); exit(11); }
for (i = 0; i < numFrames; i++) {
while (!jobs[i].done) SLEEP(5); /* wake up every 5 milliseconds to check */
fwrite_orDie(jobs[i].dst, jobs[i].dstSize, fout);
free((void*)jobs[i].src);
free(jobs[i].dst);
size_t ret = ZSTD_seekable_logFrame(fl, jobs[i].dstSize, jobs[i].srcSize, jobs[i].checksum);
if (ZSTD_isError(ret)) { fprintf(stderr, "ZSTD_seekable_logFrame() error : %s \n", ZSTD_getErrorName(ret)); }
}
{ unsigned char seekTableBuff[1024];
ZSTD_outBuffer out = {seekTableBuff, 1024, 0};
while (ZSTD_seekable_writeSeekTable(fl, &out) != 0) {
fwrite_orDie(seekTableBuff, out.pos, fout);
out.pos = 0;
}
fwrite_orDie(seekTableBuff, out.pos, fout);
}
ZSTD_seekable_freeFrameLog(fl);
free(jobs);
fclose_orDie(fout);
fclose_orDie(fin);
}
static const char* createOutFilename_orDie(const char* filename)
{
size_t const inL = strlen(filename);
size_t const outL = inL + 5;
void* outSpace = malloc_orDie(outL);
memset(outSpace, 0, outL);
strcat(outSpace, filename);
strcat(outSpace, ".zst");
return (const char*)outSpace;
}
int main(int argc, const char** argv) {
const char* const exeName = argv[0];
if (argc!=4) {
printf("wrong arguments\n");
printf("usage:\n");
printf("%s FILE FRAME_SIZE NB_THREADS\n", exeName);
return 1;
}
{ const char* const inFileName = argv[1];
unsigned const frameSize = (unsigned)atoi(argv[2]);
int const nbThreads = atoi(argv[3]);
const char* const outFileName = createOutFilename_orDie(inFileName);
compressFile_orDie(inFileName, outFileName, 5, frameSize, nbThreads);
}
return 0;
}

View File

@ -0,0 +1,193 @@
/**
* Copyright 2017-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the license found in the
* LICENSE-examples file in the root directory of this source tree.
*/
/*
* A simple demo that sums up all the bytes in the file in parallel using
* seekable decompression and the zstd thread pool
*/
#include <stdlib.h> // malloc, exit
#include <stdio.h> // fprintf, perror, feof
#include <string.h> // strerror
#include <errno.h> // errno
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h> // presumes zstd library is installed
#include <zstd_errors.h>
#if defined(WIN32) || defined(_WIN32)
# include <windows.h>
# define SLEEP(x) Sleep(x)
#else
# include <unistd.h>
# define SLEEP(x) usleep(x * 1000)
#endif
#include "pool.h" // use zstd thread pool for demo
#include "zstd_seekable.h"
#define MIN(a, b) ((a) < (b) ? (a) : (b))
static void* malloc_orDie(size_t size)
{
void* const buff = malloc(size);
if (buff) return buff;
/* error */
perror("malloc");
exit(1);
}
static void* realloc_orDie(void* ptr, size_t size)
{
ptr = realloc(ptr, size);
if (ptr) return ptr;
/* error */
perror("realloc");
exit(1);
}
static FILE* fopen_orDie(const char *filename, const char *instruction)
{
FILE* const inFile = fopen(filename, instruction);
if (inFile) return inFile;
/* error */
perror(filename);
exit(3);
}
static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file)
{
size_t const readSize = fread(buffer, 1, sizeToRead, file);
if (readSize == sizeToRead) return readSize; /* good */
if (feof(file)) return readSize; /* good, reached end of file */
/* error */
perror("fread");
exit(4);
}
static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file)
{
size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file);
if (writtenSize == sizeToWrite) return sizeToWrite; /* good */
/* error */
perror("fwrite");
exit(5);
}
static size_t fclose_orDie(FILE* file)
{
if (!fclose(file)) return 0;
/* error */
perror("fclose");
exit(6);
}
static void fseek_orDie(FILE* file, long int offset, int origin) {
if (!fseek(file, offset, origin)) {
if (!fflush(file)) return;
}
/* error */
perror("fseek");
exit(7);
}
struct sum_job {
const char* fname;
unsigned long long sum;
unsigned frameNb;
int done;
};
static void sumFrame(void* opaque)
{
struct sum_job* job = (struct sum_job*)opaque;
job->done = 0;
FILE* const fin = fopen_orDie(job->fname, "rb");
ZSTD_seekable* const seekable = ZSTD_seekable_create();
if (seekable==NULL) { fprintf(stderr, "ZSTD_seekable_create() error \n"); exit(10); }
size_t const initResult = ZSTD_seekable_initFile(seekable, fin);
if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_init() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); }
size_t const frameSize = ZSTD_seekable_getFrameDecompressedSize(seekable, job->frameNb);
unsigned char* data = malloc_orDie(frameSize);
size_t result = ZSTD_seekable_decompressFrame(seekable, data, frameSize, job->frameNb);
if (ZSTD_isError(result)) { fprintf(stderr, "ZSTD_seekable_decompressFrame() error : %s \n", ZSTD_getErrorName(result)); exit(12); }
unsigned long long sum = 0;
size_t i;
for (i = 0; i < frameSize; i++) {
sum += data[i];
}
job->sum = sum;
job->done = 1;
fclose(fin);
ZSTD_seekable_free(seekable);
free(data);
}
static void sumFile_orDie(const char* fname, int nbThreads)
{
POOL_ctx* pool = POOL_create(nbThreads, nbThreads);
if (pool == NULL) { fprintf(stderr, "POOL_create() error \n"); exit(9); }
FILE* const fin = fopen_orDie(fname, "rb");
ZSTD_seekable* const seekable = ZSTD_seekable_create();
if (seekable==NULL) { fprintf(stderr, "ZSTD_seekable_create() error \n"); exit(10); }
size_t const initResult = ZSTD_seekable_initFile(seekable, fin);
if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_init() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); }
size_t const numFrames = ZSTD_seekable_getNumFrames(seekable);
struct sum_job* jobs = (struct sum_job*)malloc(numFrames * sizeof(struct sum_job));
size_t i;
for (i = 0; i < numFrames; i++) {
jobs[i] = (struct sum_job){ fname, 0, i, 0 };
POOL_add(pool, sumFrame, &jobs[i]);
}
unsigned long long total = 0;
for (i = 0; i < numFrames; i++) {
while (!jobs[i].done) SLEEP(5); /* wake up every 5 milliseconds to check */
total += jobs[i].sum;
}
printf("Sum: %llu\n", total);
POOL_free(pool);
ZSTD_seekable_free(seekable);
fclose(fin);
free(jobs);
}
int main(int argc, const char** argv)
{
const char* const exeName = argv[0];
if (argc!=3) {
fprintf(stderr, "wrong arguments\n");
fprintf(stderr, "usage:\n");
fprintf(stderr, "%s FILE NB_THREADS\n", exeName);
return 1;
}
{
const char* const inFilename = argv[1];
int const nbThreads = atoi(argv[2]);
sumFile_orDie(inFilename, nbThreads);
}
return 0;
}

View File

@ -0,0 +1,131 @@
/**
* Copyright 2017-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the license found in the
* LICENSE-examples file in the root directory of this source tree.
*/
#include <stdlib.h> // malloc, free, exit, atoi
#include <stdio.h> // fprintf, perror, feof, fopen, etc.
#include <string.h> // strlen, memset, strcat
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h> // presumes zstd library is installed
#include "zstd_seekable.h"
static void* malloc_orDie(size_t size)
{
void* const buff = malloc(size);
if (buff) return buff;
/* error */
perror("malloc:");
exit(1);
}
static FILE* fopen_orDie(const char *filename, const char *instruction)
{
FILE* const inFile = fopen(filename, instruction);
if (inFile) return inFile;
/* error */
perror(filename);
exit(3);
}
static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file)
{
size_t const readSize = fread(buffer, 1, sizeToRead, file);
if (readSize == sizeToRead) return readSize; /* good */
if (feof(file)) return readSize; /* good, reached end of file */
/* error */
perror("fread");
exit(4);
}
static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file)
{
size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file);
if (writtenSize == sizeToWrite) return sizeToWrite; /* good */
/* error */
perror("fwrite");
exit(5);
}
static size_t fclose_orDie(FILE* file)
{
if (!fclose(file)) return 0;
/* error */
perror("fclose");
exit(6);
}
static void compressFile_orDie(const char* fname, const char* outName, int cLevel, unsigned frameSize)
{
FILE* const fin = fopen_orDie(fname, "rb");
FILE* const fout = fopen_orDie(outName, "wb");
size_t const buffInSize = ZSTD_CStreamInSize(); /* can always read one full block */
void* const buffIn = malloc_orDie(buffInSize);
size_t const buffOutSize = ZSTD_CStreamOutSize(); /* can always flush a full block */
void* const buffOut = malloc_orDie(buffOutSize);
ZSTD_seekable_CStream* const cstream = ZSTD_seekable_createCStream();
if (cstream==NULL) { fprintf(stderr, "ZSTD_seekable_createCStream() error \n"); exit(10); }
size_t const initResult = ZSTD_seekable_initCStream(cstream, cLevel, 1, frameSize);
if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_initCStream() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); }
size_t read, toRead = buffInSize;
while( (read = fread_orDie(buffIn, toRead, fin)) ) {
ZSTD_inBuffer input = { buffIn, read, 0 };
while (input.pos < input.size) {
ZSTD_outBuffer output = { buffOut, buffOutSize, 0 };
toRead = ZSTD_seekable_compressStream(cstream, &output , &input); /* toRead is guaranteed to be <= ZSTD_CStreamInSize() */
if (ZSTD_isError(toRead)) { fprintf(stderr, "ZSTD_seekable_compressStream() error : %s \n", ZSTD_getErrorName(toRead)); exit(12); }
if (toRead > buffInSize) toRead = buffInSize; /* Safely handle case when `buffInSize` is manually changed to a value < ZSTD_CStreamInSize()*/
fwrite_orDie(buffOut, output.pos, fout);
}
}
while (1) {
ZSTD_outBuffer output = { buffOut, buffOutSize, 0 };
size_t const remainingToFlush = ZSTD_seekable_endStream(cstream, &output); /* close stream */
if (ZSTD_isError(remainingToFlush)) { fprintf(stderr, "ZSTD_seekable_endStream() error : %s \n", ZSTD_getErrorName(remainingToFlush)); exit(13); }
fwrite_orDie(buffOut, output.pos, fout);
if (!remainingToFlush) break;
}
ZSTD_seekable_freeCStream(cstream);
fclose_orDie(fout);
fclose_orDie(fin);
free(buffIn);
free(buffOut);
}
static const char* createOutFilename_orDie(const char* filename)
{
size_t const inL = strlen(filename);
size_t const outL = inL + 5;
void* outSpace = malloc_orDie(outL);
memset(outSpace, 0, outL);
strcat(outSpace, filename);
strcat(outSpace, ".zst");
return (const char*)outSpace;
}
int main(int argc, const char** argv) {
const char* const exeName = argv[0];
if (argc!=3) {
printf("wrong arguments\n");
printf("usage:\n");
printf("%s FILE FRAME_SIZE\n", exeName);
return 1;
}
{ const char* const inFileName = argv[1];
unsigned const frameSize = (unsigned)atoi(argv[2]);
const char* const outFileName = createOutFilename_orDie(inFileName);
compressFile_orDie(inFileName, outFileName, 5, frameSize);
}
return 0;
}

View File

@ -0,0 +1,137 @@
/**
* Copyright 2016-present, Yann Collet, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the license found in the
* LICENSE-examples file in the root directory of this source tree.
*/
#include <stdlib.h> // malloc, exit
#include <stdio.h> // fprintf, perror, feof
#include <string.h> // strerror
#include <errno.h> // errno
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h> // presumes zstd library is installed
#include <zstd_errors.h>
#include "zstd_seekable.h"
#define MIN(a, b) ((a) < (b) ? (a) : (b))
static void* malloc_orDie(size_t size)
{
void* const buff = malloc(size);
if (buff) return buff;
/* error */
perror("malloc");
exit(1);
}
static void* realloc_orDie(void* ptr, size_t size)
{
ptr = realloc(ptr, size);
if (ptr) return ptr;
/* error */
perror("realloc");
exit(1);
}
static FILE* fopen_orDie(const char *filename, const char *instruction)
{
FILE* const inFile = fopen(filename, instruction);
if (inFile) return inFile;
/* error */
perror(filename);
exit(3);
}
static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file)
{
size_t const readSize = fread(buffer, 1, sizeToRead, file);
if (readSize == sizeToRead) return readSize; /* good */
if (feof(file)) return readSize; /* good, reached end of file */
/* error */
perror("fread");
exit(4);
}
static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file)
{
size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file);
if (writtenSize == sizeToWrite) return sizeToWrite; /* good */
/* error */
perror("fwrite");
exit(5);
}
static size_t fclose_orDie(FILE* file)
{
if (!fclose(file)) return 0;
/* error */
perror("fclose");
exit(6);
}
static void fseek_orDie(FILE* file, long int offset, int origin) {
if (!fseek(file, offset, origin)) {
if (!fflush(file)) return;
}
/* error */
perror("fseek");
exit(7);
}
static void decompressFile_orDie(const char* fname, unsigned startOffset, unsigned endOffset)
{
FILE* const fin = fopen_orDie(fname, "rb");
FILE* const fout = stdout;
size_t const buffOutSize = ZSTD_DStreamOutSize(); /* Guarantee to successfully flush at least one complete compressed block in all circumstances. */
void* const buffOut = malloc_orDie(buffOutSize);
ZSTD_seekable* const seekable = ZSTD_seekable_create();
if (seekable==NULL) { fprintf(stderr, "ZSTD_seekable_create() error \n"); exit(10); }
size_t const initResult = ZSTD_seekable_initFile(seekable, fin);
if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_init() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); }
while (startOffset < endOffset) {
size_t const result = ZSTD_seekable_decompress(seekable, buffOut, MIN(endOffset - startOffset, buffOutSize), startOffset);
if (ZSTD_isError(result)) {
fprintf(stderr, "ZSTD_seekable_decompress() error : %s \n",
ZSTD_getErrorName(result));
exit(12);
}
fwrite_orDie(buffOut, result, fout);
startOffset += result;
}
ZSTD_seekable_free(seekable);
fclose_orDie(fin);
fclose_orDie(fout);
free(buffOut);
}
int main(int argc, const char** argv)
{
const char* const exeName = argv[0];
if (argc!=4) {
fprintf(stderr, "wrong arguments\n");
fprintf(stderr, "usage:\n");
fprintf(stderr, "%s FILE START END\n", exeName);
return 1;
}
{
const char* const inFilename = argv[1];
unsigned const startOffset = (unsigned) atoi(argv[2]);
unsigned const endOffset = (unsigned) atoi(argv[3]);
decompressFile_orDie(inFilename, startOffset, endOffset);
}
return 0;
}

View File

@ -0,0 +1,184 @@
#ifndef SEEKABLE_H
#define SEEKABLE_H
#if defined (__cplusplus)
extern "C" {
#endif
#include <stdio.h>
static const unsigned ZSTD_seekTableFooterSize = 9;
#define ZSTD_SEEKABLE_MAGICNUMBER 0x8F92EAB1
#define ZSTD_SEEKABLE_MAXFRAMES 0x8000000U
/* Limit the maximum size to avoid any potential issues storing the compressed size */
#define ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE 0x80000000U
/*-****************************************************************************
* Seekable Format
*
* The seekable format splits the compressed data into a series of "frames",
* each compressed individually so that decompression of a section in the
* middle of an archive only requires zstd to decompress at most a frame's
* worth of extra data, instead of the entire archive.
******************************************************************************/
typedef struct ZSTD_seekable_CStream_s ZSTD_seekable_CStream;
typedef struct ZSTD_seekable_s ZSTD_seekable;
/*-****************************************************************************
* Seekable compression - HowTo
* A ZSTD_seekable_CStream object is required to tracking streaming operation.
* Use ZSTD_seekable_createCStream() and ZSTD_seekable_freeCStream() to create/
* release resources.
*
* Streaming objects are reusable to avoid allocation and deallocation,
* to start a new compression operation call ZSTD_seekable_initCStream() on the
* compressor.
*
* Data streamed to the seekable compressor will automatically be split into
* frames of size `maxFrameSize` (provided in ZSTD_seekable_initCStream()),
* or if none is provided, will be cut off whenever ZSTD_seekable_endFrame() is
* called or when the default maximum frame size (2GB) is reached.
*
* Use ZSTD_seekable_initCStream() to initialize a ZSTD_seekable_CStream object
* for a new compression operation.
* `maxFrameSize` indicates the size at which to automatically start a new
* seekable frame. `maxFrameSize == 0` implies the default maximum size.
* `checksumFlag` indicates whether or not the seek table should include frame
* checksums on the uncompressed data for verification.
* @return : a size hint for input to provide for compression, or an error code
* checkable with ZSTD_isError()
*
* Use ZSTD_seekable_compressStream() repetitively to consume input stream.
* The function will automatically update both `pos` fields.
* Note that it may not consume the entire input, in which case `pos < size`,
* and it's up to the caller to present again remaining data.
* @return : a size hint, preferred nb of bytes to use as input for next
* function call or an error code, which can be tested using
* ZSTD_isError().
* Note 1 : it's just a hint, to help latency a little, any other
* value will work fine.
*
* At any time, call ZSTD_seekable_endFrame() to end the current frame and
* start a new one.
*
* ZSTD_seekable_endStream() will end the current frame, and then write the seek
* table so that decompressors can efficiently find compressed frames.
* ZSTD_seekable_endStream() may return a number > 0 if it was unable to flush
* all the necessary data to `output`. In this case, it should be called again
* until all remaining data is flushed out and 0 is returned.
******************************************************************************/
/*===== Seekable compressor management =====*/
ZSTDLIB_API ZSTD_seekable_CStream* ZSTD_seekable_createCStream(void);
ZSTDLIB_API size_t ZSTD_seekable_freeCStream(ZSTD_seekable_CStream* zcs);
/*===== Seekable compression functions =====*/
ZSTDLIB_API size_t ZSTD_seekable_initCStream(ZSTD_seekable_CStream* zcs, int compressionLevel, int checksumFlag, unsigned maxFrameSize);
ZSTDLIB_API size_t ZSTD_seekable_compressStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
ZSTDLIB_API size_t ZSTD_seekable_endFrame(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output);
ZSTDLIB_API size_t ZSTD_seekable_endStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output);
/*= Raw seek table API
* These functions allow for the seek table to be constructed directly.
* This table can then be appended to a file of concatenated frames.
* This allows the frames to be compressed independently, even in parallel,
* and compiled together afterward into a seekable archive.
*
* Use ZSTD_seekable_createFrameLog() to allocate and initialize a tracking
* structure.
*
* Call ZSTD_seekable_logFrame() once for each frame in the archive.
* checksum is optional, and will not be used if checksumFlag was 0 when the
* frame log was created. If present, it should be the least significant 32
* bits of the XXH64 hash of the uncompressed data.
*
* Call ZSTD_seekable_writeSeekTable to serialize the data into a seek table.
* If the entire table was written, the return value will be 0. Otherwise,
* it will be equal to the number of bytes left to write. */
typedef struct ZSTD_frameLog_s ZSTD_frameLog;
ZSTDLIB_API ZSTD_frameLog* ZSTD_seekable_createFrameLog(int checksumFlag);
ZSTDLIB_API size_t ZSTD_seekable_freeFrameLog(ZSTD_frameLog* fl);
ZSTDLIB_API size_t ZSTD_seekable_logFrame(ZSTD_frameLog* fl, unsigned compressedSize, unsigned decompressedSize, unsigned checksum);
ZSTDLIB_API size_t ZSTD_seekable_writeSeekTable(ZSTD_frameLog* fl, ZSTD_outBuffer* output);
/*-****************************************************************************
* Seekable decompression - HowTo
* A ZSTD_seekable object is required to tracking the seekTable.
*
* Call ZSTD_seekable_init* to initialize a ZSTD_seekable object with the
* the seek table provided in the input.
* There are three modes for ZSTD_seekable_init:
* - ZSTD_seekable_initBuff() : An in-memory API. The data contained in
* `src` should be the entire seekable file, including the seek table.
* `src` should be kept alive and unmodified until the ZSTD_seekable object
* is freed or reset.
* - ZSTD_seekable_initFile() : A simplified file API using stdio. fread and
* fseek will be used to access the required data for building the seek
* table and doing decompression operations. `src` should not be closed
* or modified until the ZSTD_seekable object is freed or reset.
* - ZSTD_seekable_initAdvanced() : A general API allowing the client to
* provide its own read and seek callbacks.
* + ZSTD_seekable_read() : read exactly `n` bytes into `buffer`.
* Premature EOF should be treated as an error.
* + ZSTD_seekable_seek() : seek the read head to `offset` from `origin`,
* where origin is either SEEK_SET (beginning of
* file), or SEEK_END (end of file).
* Both functions should return a non-negative value in case of success, and a
* negative value in case of failure. If implementing using this API and
* stdio, be careful with files larger than 4GB and fseek. All of these
* functions return an error code checkable with ZSTD_isError().
*
* Call ZSTD_seekable_decompress to decompress `dstSize` bytes at decompressed
* offset `offset`. ZSTD_seekable_decompress may have to decompress the entire
* prefix of the frame before the desired data if it has not already processed
* this section. If ZSTD_seekable_decompress is called multiple times for a
* consecutive range of data, it will efficiently retain the decompressor object
* and avoid redecompressing frame prefixes. The return value is the number of
* bytes decompressed, or an error code checkable with ZSTD_isError().
*
* The seek table access functions can be used to obtain the data contained
* in the seek table. If frameIndex is larger than the value returned by
* ZSTD_seekable_getNumFrames(), they will return error codes checkable with
* ZSTD_isError(). Note that since the offset access functions return
* unsigned long long instead of size_t, in this case they will instead return
* the value ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE.
******************************************************************************/
/*===== Seekable decompressor management =====*/
ZSTDLIB_API ZSTD_seekable* ZSTD_seekable_create(void);
ZSTDLIB_API size_t ZSTD_seekable_free(ZSTD_seekable* zs);
/*===== Seekable decompression functions =====*/
ZSTDLIB_API size_t ZSTD_seekable_initBuff(ZSTD_seekable* zs, const void* src, size_t srcSize);
ZSTDLIB_API size_t ZSTD_seekable_initFile(ZSTD_seekable* zs, FILE* src);
ZSTDLIB_API size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned long long offset);
ZSTDLIB_API size_t ZSTD_seekable_decompressFrame(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned frameIndex);
#define ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE (0ULL-2)
/*===== Seek Table access functions =====*/
ZSTDLIB_API unsigned ZSTD_seekable_getNumFrames(ZSTD_seekable* const zs);
ZSTDLIB_API unsigned long long ZSTD_seekable_getFrameCompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex);
ZSTDLIB_API unsigned long long ZSTD_seekable_getFrameDecompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex);
ZSTDLIB_API size_t ZSTD_seekable_getFrameCompressedSize(ZSTD_seekable* const zs, unsigned frameIndex);
ZSTDLIB_API size_t ZSTD_seekable_getFrameDecompressedSize(ZSTD_seekable* const zs, unsigned frameIndex);
ZSTDLIB_API unsigned ZSTD_seekable_offsetToFrameIndex(ZSTD_seekable* const zs, unsigned long long offset);
/*===== Seekable advanced I/O API =====*/
typedef int(ZSTD_seekable_read)(void* opaque, void* buffer, size_t n);
typedef int(ZSTD_seekable_seek)(void* opaque, long long offset, int origin);
typedef struct {
void* opaque;
ZSTD_seekable_read* read;
ZSTD_seekable_seek* seek;
} ZSTD_seekable_customFile;
ZSTDLIB_API size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile src);
#if defined (__cplusplus)
}
#endif
#endif

View File

@ -0,0 +1,116 @@
# Zstandard Seekable Format
### Notices
Copyright (c) 2017-present Facebook, Inc.
Permission is granted to copy and distribute this document
for any purpose and without charge,
including translations into other languages
and incorporation into compilations,
provided that the copyright notice and this notice are preserved,
and that any substantive changes or deletions from the original
are clearly marked.
Distribution of this document is unlimited.
### Version
0.1.0 (11/04/17)
## Introduction
This document defines a format for compressed data to be stored so that subranges of the data can be efficiently decompressed without requiring the entire document to be decompressed.
This is done by splitting up the input data into frames,
each of which are compressed independently,
and so can be decompressed independently.
Decompression then takes advantage of a provided 'seek table', which allows the decompressor to immediately jump to the desired data. This is done in a way that is compatible with the original Zstandard format by placing the seek table in a Zstandard skippable frame.
### Overall conventions
In this document:
- square brackets i.e. `[` and `]` are used to indicate optional fields or parameters.
- the naming convention for identifiers is `Mixed_Case_With_Underscores`
- All numeric fields are little-endian unless specified otherwise
## Format
The format consists of a number of frames (Zstandard compressed frames and skippable frames), followed by a final skippable frame at the end containing the seek table.
### Seek Table Format
The structure of the seek table frame is as follows:
|`Skippable_Magic_Number`|`Frame_Size`|`[Seek_Table_Entries]`|`Seek_Table_Footer`|
|------------------------|------------|----------------------|-------------------|
| 4 bytes | 4 bytes | 8-12 bytes each | 9 bytes |
__`Skippable_Magic_Number`__
Value : 0x184D2A5E.
This is for compatibility with [Zstandard skippable frames].
Since it is legal for other Zstandard skippable frames to use the same
magic number, it is not recommended for a decoder to recognize frames
solely on this.
__`Frame_Size`__
The total size of the skippable frame, not including the `Skippable_Magic_Number` or `Frame_Size`.
This is for compatibility with [Zstandard skippable frames].
[Zstandard skippable frames]: https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#skippable-frames
#### `Seek_Table_Footer`
The seek table footer format is as follows:
|`Number_Of_Frames`|`Seek_Table_Descriptor`|`Seekable_Magic_Number`|
|------------------|-----------------------|-----------------------|
| 4 bytes | 1 byte | 4 bytes |
__`Seekable_Magic_Number`__
Value : 0x8F92EAB1.
This value must be the last bytes present in the compressed file so that decoders
can efficiently find it and determine if there is an actual seek table present.
__`Number_Of_Frames`__
The number of stored frames in the data.
__`Seek_Table_Descriptor`__
A bitfield describing the format of the seek table.
| Bit number | Field name |
| ---------- | ---------- |
| 7 | `Checksum_Flag` |
| 6-2 | `Reserved_Bits` |
| 1-0 | `Unused_Bits` |
While only `Checksum_Flag` currently exists, there are 7 other bits in this field that can be used for future changes to the format,
for example the addition of inline dictionaries.
__`Checksum_Flag`__
If the checksum flag is set, each of the seek table entries contains a 4 byte checksum of the uncompressed data contained in its frame.
`Reserved_Bits` are not currently used but may be used in the future for breaking changes, so a compliant decoder should ensure they are set to 0. `Unused_Bits` may be used in the future for non-breaking changes, so a compliant decoder should not interpret these bits.
#### __`Seek_Table_Entries`__
`Seek_Table_Entries` consists of `Number_Of_Frames` (one for each frame in the data, not including the seek table frame) entries of the following form, in sequence:
|`Compressed_Size`|`Decompressed_Size`|`[Checksum]`|
|-----------------|-------------------|------------|
| 4 bytes | 4 bytes | 4 bytes |
__`Compressed_Size`__
The compressed size of the frame.
The cumulative sum of the `Compressed_Size` fields of frames `0` to `i` gives the offset in the compressed file of frame `i+1`.
__`Decompressed_Size`__
The size of the decompressed data contained in the frame. For skippable or otherwise empty frames, this value is 0.
__`Checksum`__
Only present if `Checksum_Flag` is set in the `Seek_Table_Descriptor`. Value : the least significant 32 bits of the XXH64 digest of the uncompressed data, stored in little-endian format.
## Version Changes
- 0.1.0: initial version

View File

@ -0,0 +1,366 @@
/**
* Copyright (c) 2017-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*/
#include <stdlib.h> /* malloc, free */
#define XXH_STATIC_LINKING_ONLY
#define XXH_NAMESPACE ZSTD_
#include "xxhash.h"
#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"
#include "zstd_errors.h"
#include "mem.h"
#include "zstd_seekable.h"
#define CHECK_Z(f) { size_t const ret = (f); if (ret != 0) return ret; }
#undef ERROR
#define ERROR(name) ((size_t)-ZSTD_error_##name)
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
typedef struct {
U32 cSize;
U32 dSize;
U32 checksum;
} framelogEntry_t;
struct ZSTD_frameLog_s {
framelogEntry_t* entries;
U32 size;
U32 capacity;
int checksumFlag;
/* for use when streaming out the seek table */
U32 seekTablePos;
U32 seekTableIndex;
} framelog_t;
struct ZSTD_seekable_CStream_s {
ZSTD_CStream* cstream;
ZSTD_frameLog framelog;
U32 frameCSize;
U32 frameDSize;
XXH64_state_t xxhState;
U32 maxFrameSize;
int writingSeekTable;
};
size_t ZSTD_seekable_frameLog_allocVec(ZSTD_frameLog* fl)
{
/* allocate some initial space */
size_t const FRAMELOG_STARTING_CAPACITY = 16;
fl->entries = (framelogEntry_t*)malloc(
sizeof(framelogEntry_t) * FRAMELOG_STARTING_CAPACITY);
if (fl->entries == NULL) return ERROR(memory_allocation);
fl->capacity = FRAMELOG_STARTING_CAPACITY;
return 0;
}
size_t ZSTD_seekable_frameLog_freeVec(ZSTD_frameLog* fl)
{
if (fl != NULL) free(fl->entries);
return 0;
}
ZSTD_frameLog* ZSTD_seekable_createFrameLog(int checksumFlag)
{
ZSTD_frameLog* fl = malloc(sizeof(ZSTD_frameLog));
if (fl == NULL) return NULL;
if (ZSTD_isError(ZSTD_seekable_frameLog_allocVec(fl))) {
free(fl);
return NULL;
}
fl->checksumFlag = checksumFlag;
fl->seekTablePos = 0;
fl->seekTableIndex = 0;
fl->size = 0;
return fl;
}
size_t ZSTD_seekable_freeFrameLog(ZSTD_frameLog* fl)
{
ZSTD_seekable_frameLog_freeVec(fl);
free(fl);
return 0;
}
ZSTD_seekable_CStream* ZSTD_seekable_createCStream()
{
ZSTD_seekable_CStream* zcs = malloc(sizeof(ZSTD_seekable_CStream));
if (zcs == NULL) return NULL;
memset(zcs, 0, sizeof(*zcs));
zcs->cstream = ZSTD_createCStream();
if (zcs->cstream == NULL) goto failed1;
if (ZSTD_isError(ZSTD_seekable_frameLog_allocVec(&zcs->framelog))) goto failed2;
return zcs;
failed2:
ZSTD_freeCStream(zcs->cstream);
failed1:
free(zcs);
return NULL;
}
size_t ZSTD_seekable_freeCStream(ZSTD_seekable_CStream* zcs)
{
if (zcs == NULL) return 0; /* support free on null */
ZSTD_freeCStream(zcs->cstream);
ZSTD_seekable_frameLog_freeVec(&zcs->framelog);
free(zcs);
return 0;
}
size_t ZSTD_seekable_initCStream(ZSTD_seekable_CStream* zcs,
int compressionLevel,
int checksumFlag,
U32 maxFrameSize)
{
zcs->framelog.size = 0;
zcs->frameCSize = 0;
zcs->frameDSize = 0;
/* make sure maxFrameSize has a reasonable value */
if (maxFrameSize > ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE) {
return ERROR(compressionParameter_unsupported);
}
zcs->maxFrameSize = maxFrameSize
? maxFrameSize
: ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE;
zcs->framelog.checksumFlag = checksumFlag;
if (zcs->framelog.checksumFlag) {
XXH64_reset(&zcs->xxhState, 0);
}
zcs->framelog.seekTablePos = 0;
zcs->framelog.seekTableIndex = 0;
zcs->writingSeekTable = 0;
return ZSTD_initCStream(zcs->cstream, compressionLevel);
}
size_t ZSTD_seekable_logFrame(ZSTD_frameLog* fl,
unsigned compressedSize,
unsigned decompressedSize,
unsigned checksum)
{
if (fl->size == ZSTD_SEEKABLE_MAXFRAMES)
return ERROR(frameIndex_tooLarge);
/* grow the buffer if required */
if (fl->size == fl->capacity) {
/* exponential size increase for constant amortized runtime */
size_t const newCapacity = fl->capacity * 2;
framelogEntry_t* const newEntries = realloc(fl->entries,
sizeof(framelogEntry_t) * newCapacity);
if (newEntries == NULL) return ERROR(memory_allocation);
fl->entries = newEntries;
fl->capacity = newCapacity;
}
fl->entries[fl->size] = (framelogEntry_t){
compressedSize, decompressedSize, checksum
};
fl->size++;
return 0;
}
size_t ZSTD_seekable_endFrame(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output)
{
size_t const prevOutPos = output->pos;
/* end the frame */
size_t ret = ZSTD_endStream(zcs->cstream, output);
zcs->frameCSize += output->pos - prevOutPos;
/* need to flush before doing the rest */
if (ret) return ret;
/* frame done */
/* store the frame data for later */
ret = ZSTD_seekable_logFrame(
&zcs->framelog, zcs->frameCSize, zcs->frameDSize,
zcs->framelog.checksumFlag
? XXH64_digest(&zcs->xxhState) & 0xFFFFFFFFU
: 0);
if (ret) return ret;
/* reset for the next frame */
zcs->frameCSize = 0;
zcs->frameDSize = 0;
ZSTD_resetCStream(zcs->cstream, 0);
if (zcs->framelog.checksumFlag)
XXH64_reset(&zcs->xxhState, 0);
return 0;
}
size_t ZSTD_seekable_compressStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
{
const BYTE* const inBase = (const BYTE*) input->src + input->pos;
size_t inLen = input->size - input->pos;
inLen = MIN(inLen, (size_t)(zcs->maxFrameSize - zcs->frameDSize));
/* if we haven't finished flushing the last frame, don't start writing a new one */
if (inLen > 0) {
ZSTD_inBuffer inTmp = { inBase, inLen, 0 };
size_t const prevOutPos = output->pos;
size_t const ret = ZSTD_compressStream(zcs->cstream, output, &inTmp);
if (zcs->framelog.checksumFlag) {
XXH64_update(&zcs->xxhState, inBase, inTmp.pos);
}
zcs->frameCSize += output->pos - prevOutPos;
zcs->frameDSize += inTmp.pos;
input->pos += inTmp.pos;
if (ZSTD_isError(ret)) return ret;
}
if (zcs->maxFrameSize == zcs->frameDSize) {
/* log the frame and start over */
size_t const ret = ZSTD_seekable_endFrame(zcs, output);
if (ZSTD_isError(ret)) return ret;
/* get the client ready for the next frame */
return (size_t)zcs->maxFrameSize;
}
return (size_t)(zcs->maxFrameSize - zcs->frameDSize);
}
static inline size_t ZSTD_seekable_seekTableSize(const ZSTD_frameLog* fl)
{
size_t const sizePerFrame = 8 + (fl->checksumFlag?4:0);
size_t const seekTableLen = ZSTD_skippableHeaderSize +
sizePerFrame * fl->size +
ZSTD_seekTableFooterSize;
return seekTableLen;
}
static inline size_t ZSTD_stwrite32(ZSTD_frameLog* fl,
ZSTD_outBuffer* output, U32 const value,
U32 const offset)
{
if (fl->seekTablePos < offset + 4) {
BYTE tmp[4]; /* so that we can work with buffers too small to write a whole word to */
size_t const lenWrite =
MIN(output->size - output->pos, offset + 4 - fl->seekTablePos);
MEM_writeLE32(tmp, value);
memcpy((BYTE*)output->dst + output->pos,
tmp + (fl->seekTablePos - offset), lenWrite);
output->pos += lenWrite;
fl->seekTablePos += lenWrite;
if (lenWrite < 4) return ZSTD_seekable_seekTableSize(fl) - fl->seekTablePos;
}
return 0;
}
size_t ZSTD_seekable_writeSeekTable(ZSTD_frameLog* fl, ZSTD_outBuffer* output)
{
/* seekTableIndex: the current index in the table and
* seekTableSize: the amount of the table written so far
*
* This function is written this way so that if it has to return early
* because of a small buffer, it can keep going where it left off.
*/
size_t const sizePerFrame = 8 + (fl->checksumFlag?4:0);
size_t const seekTableLen = ZSTD_seekable_seekTableSize(fl);
CHECK_Z(ZSTD_stwrite32(fl, output, ZSTD_MAGIC_SKIPPABLE_START | 0xE, 0));
CHECK_Z(ZSTD_stwrite32(fl, output, seekTableLen - ZSTD_skippableHeaderSize,
4));
while (fl->seekTableIndex < fl->size) {
CHECK_Z(ZSTD_stwrite32(fl, output,
fl->entries[fl->seekTableIndex].cSize,
ZSTD_skippableHeaderSize +
sizePerFrame * fl->seekTableIndex + 0));
CHECK_Z(ZSTD_stwrite32(fl, output,
fl->entries[fl->seekTableIndex].dSize,
ZSTD_skippableHeaderSize +
sizePerFrame * fl->seekTableIndex + 4));
if (fl->checksumFlag) {
CHECK_Z(ZSTD_stwrite32(
fl, output, fl->entries[fl->seekTableIndex].checksum,
ZSTD_skippableHeaderSize +
sizePerFrame * fl->seekTableIndex + 8));
}
fl->seekTableIndex++;
}
CHECK_Z(ZSTD_stwrite32(fl, output, fl->size,
seekTableLen - ZSTD_seekTableFooterSize));
if (output->size - output->pos < 1) return seekTableLen - fl->seekTablePos;
if (fl->seekTablePos < seekTableLen - 4) {
BYTE sfd = 0;
sfd |= (fl->checksumFlag) << 7;
((BYTE*)output->dst)[output->pos] = sfd;
output->pos++;
fl->seekTablePos++;
}
CHECK_Z(ZSTD_stwrite32(fl, output, ZSTD_SEEKABLE_MAGICNUMBER,
seekTableLen - 4));
if (fl->seekTablePos != seekTableLen) return ERROR(GENERIC);
return 0;
}
size_t ZSTD_seekable_endStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output)
{
if (!zcs->writingSeekTable && zcs->frameDSize) {
const size_t endFrame = ZSTD_seekable_endFrame(zcs, output);
if (ZSTD_isError(endFrame)) return endFrame;
/* return an accurate size hint */
if (endFrame) return endFrame + ZSTD_seekable_seekTableSize(&zcs->framelog);
}
zcs->writingSeekTable = 1;
return ZSTD_seekable_writeSeekTable(&zcs->framelog, output);
}

View File

@ -0,0 +1,461 @@
/*
* Copyright (c) 2017-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*/
/* *********************************************************
* Turn on Large Files support (>4GB) for 32-bit Linux/Unix
***********************************************************/
#if !defined(__64BIT__) || defined(__MINGW32__) /* No point defining Large file for 64 bit but MinGW-w64 requires it */
# if !defined(_FILE_OFFSET_BITS)
# define _FILE_OFFSET_BITS 64 /* turn off_t into a 64-bit type for ftello, fseeko */
# endif
# if !defined(_LARGEFILE_SOURCE) /* obsolete macro, replaced with _FILE_OFFSET_BITS */
# define _LARGEFILE_SOURCE 1 /* Large File Support extension (LFS) - fseeko, ftello */
# endif
# if defined(_AIX) || defined(__hpux)
# define _LARGE_FILES /* Large file support on 32-bits AIX and HP-UX */
# endif
#endif
/* ************************************************************
* Avoid fseek()'s 2GiB barrier with MSVC, MacOS, *BSD, MinGW
***************************************************************/
#if defined(_MSC_VER) && _MSC_VER >= 1400
# define LONG_SEEK _fseeki64
#elif !defined(__64BIT__) && (PLATFORM_POSIX_VERSION >= 200112L) /* No point defining Large file for 64 bit */
# define LONG_SEEK fseeko
#elif defined(__MINGW32__) && !defined(__STRICT_ANSI__) && !defined(__NO_MINGW_LFS) && defined(__MSVCRT__)
# define LONG_SEEK fseeko64
#elif defined(_WIN32) && !defined(__DJGPP__)
# include <windows.h>
static int LONG_SEEK(FILE* file, __int64 offset, int origin) {
LARGE_INTEGER off;
DWORD method;
off.QuadPart = offset;
if (origin == SEEK_END)
method = FILE_END;
else if (origin == SEEK_CUR)
method = FILE_CURRENT;
else
method = FILE_BEGIN;
if (SetFilePointerEx((HANDLE) _get_osfhandle(_fileno(file)), off, NULL, method))
return 0;
else
return -1;
}
#else
# define LONG_SEEK fseek
#endif
#include <stdlib.h> /* malloc, free */
#include <stdio.h> /* FILE* */
#define XXH_STATIC_LINKING_ONLY
#define XXH_NAMESPACE ZSTD_
#include "xxhash.h"
#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"
#include "zstd_errors.h"
#include "mem.h"
#include "zstd_seekable.h"
#undef ERROR
#define ERROR(name) ((size_t)-ZSTD_error_##name)
#define CHECK_IO(f) { int const errcod = (f); if (errcod < 0) return ERROR(seekableIO); }
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
/* Special-case callbacks for FILE* and in-memory modes, so that we can treat
* them the same way as the advanced API */
static int ZSTD_seekable_read_FILE(void* opaque, void* buffer, size_t n)
{
size_t const result = fread(buffer, 1, n, (FILE*)opaque);
if (result != n) {
return -1;
}
return 0;
}
static int ZSTD_seekable_seek_FILE(void* opaque, S64 offset, int origin)
{
int const ret = LONG_SEEK((FILE*)opaque, offset, origin);
if (ret) return ret;
return fflush((FILE*)opaque);
}
typedef struct {
const void *ptr;
size_t size;
size_t pos;
} buffWrapper_t;
static int ZSTD_seekable_read_buff(void* opaque, void* buffer, size_t n)
{
buffWrapper_t* buff = (buffWrapper_t*) opaque;
if (buff->size + n > buff->pos) return -1;
memcpy(buffer, (const BYTE*)buff->ptr + buff->pos, n);
buff->pos += n;
return 0;
}
static int ZSTD_seekable_seek_buff(void* opaque, S64 offset, int origin)
{
buffWrapper_t* buff = (buffWrapper_t*) opaque;
unsigned long long newOffset;
switch (origin) {
case SEEK_SET:
newOffset = offset;
break;
case SEEK_CUR:
newOffset = (unsigned long long)buff->pos + offset;
break;
case SEEK_END:
newOffset = (unsigned long long)buff->size - offset;
break;
}
if (newOffset < 0 || newOffset > buff->size) {
return -1;
}
buff->pos = newOffset;
return 0;
}
typedef struct {
U64 cOffset;
U64 dOffset;
U32 checksum;
} seekEntry_t;
typedef struct {
seekEntry_t* entries;
size_t tableLen;
int checksumFlag;
} seekTable_t;
#define SEEKABLE_BUFF_SIZE ZSTD_BLOCKSIZE_ABSOLUTEMAX
struct ZSTD_seekable_s {
ZSTD_DStream* dstream;
seekTable_t seekTable;
ZSTD_seekable_customFile src;
U64 decompressedOffset;
U32 curFrame;
BYTE inBuff[SEEKABLE_BUFF_SIZE]; /* need to do our own input buffering */
BYTE outBuff[SEEKABLE_BUFF_SIZE]; /* so we can efficiently decompress the
starts of chunks before we get to the
desired section */
ZSTD_inBuffer in; /* maintain continuity across ZSTD_seekable_decompress operations */
buffWrapper_t buffWrapper; /* for `src.opaque` in in-memory mode */
XXH64_state_t xxhState;
};
ZSTD_seekable* ZSTD_seekable_create(void)
{
ZSTD_seekable* zs = malloc(sizeof(ZSTD_seekable));
if (zs == NULL) return NULL;
/* also initializes stage to zsds_init */
memset(zs, 0, sizeof(*zs));
zs->dstream = ZSTD_createDStream();
if (zs->dstream == NULL) {
free(zs);
return NULL;
}
return zs;
}
size_t ZSTD_seekable_free(ZSTD_seekable* zs)
{
if (zs == NULL) return 0; /* support free on null */
ZSTD_freeDStream(zs->dstream);
free(zs->seekTable.entries);
free(zs);
return 0;
}
/** ZSTD_seekable_offsetToFrameIndex() :
* Performs a binary search to find the last frame with a decompressed offset
* <= pos
* @return : the frame's index */
U32 ZSTD_seekable_offsetToFrameIndex(ZSTD_seekable* const zs, U64 pos)
{
U32 lo = 0;
U32 hi = zs->seekTable.tableLen;
if (pos >= zs->seekTable.entries[zs->seekTable.tableLen].dOffset) {
return zs->seekTable.tableLen;
}
while (lo + 1 < hi) {
U32 const mid = lo + ((hi - lo) >> 1);
if (zs->seekTable.entries[mid].dOffset <= pos) {
lo = mid;
} else {
hi = mid;
}
}
return lo;
}
U32 ZSTD_seekable_getNumFrames(ZSTD_seekable* const zs)
{
return zs->seekTable.tableLen;
}
U64 ZSTD_seekable_getFrameCompressedOffset(ZSTD_seekable* const zs, U32 frameIndex)
{
if (frameIndex >= zs->seekTable.tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE;
return zs->seekTable.entries[frameIndex].cOffset;
}
U64 ZSTD_seekable_getFrameDecompressedOffset(ZSTD_seekable* const zs, U32 frameIndex)
{
if (frameIndex >= zs->seekTable.tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE;
return zs->seekTable.entries[frameIndex].dOffset;
}
size_t ZSTD_seekable_getFrameCompressedSize(ZSTD_seekable* const zs, U32 frameIndex)
{
if (frameIndex >= zs->seekTable.tableLen) return ERROR(frameIndex_tooLarge);
return zs->seekTable.entries[frameIndex + 1].cOffset -
zs->seekTable.entries[frameIndex].cOffset;
}
size_t ZSTD_seekable_getFrameDecompressedSize(ZSTD_seekable* const zs, U32 frameIndex)
{
if (frameIndex > zs->seekTable.tableLen) return ERROR(frameIndex_tooLarge);
return zs->seekTable.entries[frameIndex + 1].dOffset -
zs->seekTable.entries[frameIndex].dOffset;
}
static size_t ZSTD_seekable_loadSeekTable(ZSTD_seekable* zs)
{
int checksumFlag;
ZSTD_seekable_customFile src = zs->src;
/* read the footer, fixed size */
CHECK_IO(src.seek(src.opaque, -(int)ZSTD_seekTableFooterSize, SEEK_END));
CHECK_IO(src.read(src.opaque, zs->inBuff, ZSTD_seekTableFooterSize));
if (MEM_readLE32(zs->inBuff + 5) != ZSTD_SEEKABLE_MAGICNUMBER) {
return ERROR(prefix_unknown);
}
{ BYTE const sfd = zs->inBuff[4];
checksumFlag = sfd >> 7;
/* check reserved bits */
if ((checksumFlag >> 2) & 0x1f) {
return ERROR(corruption_detected);
}
}
{ U32 const numFrames = MEM_readLE32(zs->inBuff);
U32 const sizePerEntry = 8 + (checksumFlag?4:0);
U32 const tableSize = sizePerEntry * numFrames;
U32 const frameSize = tableSize + ZSTD_seekTableFooterSize + ZSTD_skippableHeaderSize;
U32 remaining = frameSize - ZSTD_seekTableFooterSize; /* don't need to re-read footer */
{
U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE);
CHECK_IO(src.seek(src.opaque, -(S64)frameSize, SEEK_END));
CHECK_IO(src.read(src.opaque, zs->inBuff, toRead));
remaining -= toRead;
}
if (MEM_readLE32(zs->inBuff) != (ZSTD_MAGIC_SKIPPABLE_START | 0xE)) {
return ERROR(prefix_unknown);
}
if (MEM_readLE32(zs->inBuff+4) + ZSTD_skippableHeaderSize != frameSize) {
return ERROR(prefix_unknown);
}
{ /* Allocate an extra entry at the end so that we can do size
* computations on the last element without special case */
seekEntry_t* entries = (seekEntry_t*)malloc(sizeof(seekEntry_t) * (numFrames + 1));
const BYTE* tableBase = zs->inBuff + ZSTD_skippableHeaderSize;
U32 idx = 0;
U32 pos = 8;
U64 cOffset = 0;
U64 dOffset = 0;
if (!entries) {
free(entries);
return ERROR(memory_allocation);
}
/* compute cumulative positions */
for (; idx < numFrames; idx++) {
if (pos + sizePerEntry > SEEKABLE_BUFF_SIZE) {
U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE);
U32 const offset = SEEKABLE_BUFF_SIZE - pos;
memmove(zs->inBuff, zs->inBuff + pos, offset); /* move any data we haven't read yet */
CHECK_IO(src.read(src.opaque, zs->inBuff+offset, toRead));
remaining -= toRead;
pos = 0;
}
entries[idx].cOffset = cOffset;
entries[idx].dOffset = dOffset;
cOffset += MEM_readLE32(zs->inBuff + pos);
pos += 4;
dOffset += MEM_readLE32(zs->inBuff + pos);
pos += 4;
if (checksumFlag) {
entries[idx].checksum = MEM_readLE32(zs->inBuff + pos);
pos += 4;
}
}
entries[numFrames].cOffset = cOffset;
entries[numFrames].dOffset = dOffset;
zs->seekTable.entries = entries;
zs->seekTable.tableLen = numFrames;
zs->seekTable.checksumFlag = checksumFlag;
return 0;
}
}
}
size_t ZSTD_seekable_initBuff(ZSTD_seekable* zs, const void* src, size_t srcSize)
{
zs->buffWrapper = (buffWrapper_t){src, srcSize, 0};
{ ZSTD_seekable_customFile srcFile = {&zs->buffWrapper,
&ZSTD_seekable_read_buff,
&ZSTD_seekable_seek_buff};
return ZSTD_seekable_initAdvanced(zs, srcFile); }
}
size_t ZSTD_seekable_initFile(ZSTD_seekable* zs, FILE* src)
{
ZSTD_seekable_customFile srcFile = {src, &ZSTD_seekable_read_FILE,
&ZSTD_seekable_seek_FILE};
return ZSTD_seekable_initAdvanced(zs, srcFile);
}
size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile src)
{
zs->src = src;
{ const size_t seekTableInit = ZSTD_seekable_loadSeekTable(zs);
if (ZSTD_isError(seekTableInit)) return seekTableInit; }
zs->decompressedOffset = (U64)-1;
zs->curFrame = (U32)-1;
{ const size_t dstreamInit = ZSTD_initDStream(zs->dstream);
if (ZSTD_isError(dstreamInit)) return dstreamInit; }
return 0;
}
size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t len, U64 offset)
{
U32 targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, offset);
do {
/* check if we can continue from a previous decompress job */
if (targetFrame != zs->curFrame || offset != zs->decompressedOffset) {
zs->decompressedOffset = zs->seekTable.entries[targetFrame].dOffset;
zs->curFrame = targetFrame;
CHECK_IO(zs->src.seek(zs->src.opaque,
zs->seekTable.entries[targetFrame].cOffset,
SEEK_SET));
zs->in = (ZSTD_inBuffer){zs->inBuff, 0, 0};
XXH64_reset(&zs->xxhState, 0);
ZSTD_resetDStream(zs->dstream);
}
while (zs->decompressedOffset < offset + len) {
size_t toRead;
ZSTD_outBuffer outTmp;
size_t prevOutPos;
if (zs->decompressedOffset < offset) {
/* dummy decompressions until we get to the target offset */
outTmp = (ZSTD_outBuffer){zs->outBuff, MIN(SEEKABLE_BUFF_SIZE, offset - zs->decompressedOffset), 0};
} else {
outTmp = (ZSTD_outBuffer){dst, len, zs->decompressedOffset - offset};
}
prevOutPos = outTmp.pos;
toRead = ZSTD_decompressStream(zs->dstream, &outTmp, &zs->in);
if (ZSTD_isError(toRead)) {
return toRead;
}
if (zs->seekTable.checksumFlag) {
XXH64_update(&zs->xxhState, (BYTE*)outTmp.dst + prevOutPos,
outTmp.pos - prevOutPos);
}
zs->decompressedOffset += outTmp.pos - prevOutPos;
if (toRead == 0) {
/* frame complete */
/* verify checksum */
if (zs->seekTable.checksumFlag &&
(XXH64_digest(&zs->xxhState) & 0xFFFFFFFFU) !=
zs->seekTable.entries[targetFrame].checksum) {
return ERROR(corruption_detected);
}
if (zs->decompressedOffset < offset + len) {
/* go back to the start and force a reset of the stream */
targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, zs->decompressedOffset);
}
break;
}
/* read in more data if we're done with this buffer */
if (zs->in.pos == zs->in.size) {
toRead = MIN(toRead, SEEKABLE_BUFF_SIZE);
CHECK_IO(zs->src.read(zs->src.opaque, zs->inBuff, toRead));
zs->in.size = toRead;
zs->in.pos = 0;
}
}
} while (zs->decompressedOffset != offset + len);
return len;
}
size_t ZSTD_seekable_decompressFrame(ZSTD_seekable* zs, void* dst, size_t dstSize, U32 frameIndex)
{
if (frameIndex >= zs->seekTable.tableLen) {
return ERROR(frameIndex_tooLarge);
}
{
size_t const decompressedSize =
zs->seekTable.entries[frameIndex + 1].dOffset -
zs->seekTable.entries[frameIndex].dOffset;
if (dstSize < decompressedSize) {
return ERROR(dstSize_tooSmall);
}
return ZSTD_seekable_decompress(
zs, dst, decompressedSize,
zs->seekTable.entries[frameIndex].dOffset);
}
}

View File

@ -38,6 +38,8 @@ const char* ERR_getErrorString(ERR_enum code)
case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
case PREFIX(dictionary_wrong): return "Dictionary mismatch";
case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
case PREFIX(maxCode):
default: return notErrorCode;
}

View File

@ -58,6 +58,8 @@ typedef enum {
ZSTD_error_dictionary_corrupted,
ZSTD_error_dictionary_wrong,
ZSTD_error_dictionaryCreation_failed,
ZSTD_error_frameIndex_tooLarge,
ZSTD_error_seekableIO,
ZSTD_error_maxCode
} ZSTD_ErrorCode;