[regression] Add dictionary support

Dictionaries are prebuilt and saved as part of the data object. The config decides whether or not to use the dictionary if it is available. Configs that require dictionaries are only run with data that has a dictionary. Each method skips the configs that are irrelevant to it; for example, ZSTD_compress() skips configs with dictionaries.

I've also trimmed the silesia source to 1 MB per file (12 MB total), and added 500 samples from the github data set with a dictionary.

I've intentionally added an extra line to `results.csv` to make the nightly build fail, so that we can see how CircleCI reports it.

Full list of changes:
* Add pre-built dictionaries to the data.
* Add `use_dictionary` and `no_pledged_src_size` flags to the config.
* Add a config using a dictionary for every level.
* Add a config that specifies no pledged source size.
* Support dictionaries and streaming in the `zstdcli` method.
* Add a context-reuse method using `ZSTD_compressCCtx()`.
* Clean up the formatting of the `results.csv` file to align columns.
* Add `--data`, `--config`, and `--method` flags to constrain each to a particular value. This is useful for debugging a failure or a particular config/method/data combination.
2018-11-30 17:16:19 -08:00 · 2018-11-30 17:16:19 -08:00 · e859862341
commit e859862341
parent 386c9ab58a
7 changed files with 657 additions and 226 deletions
--- a/tests/regression/config.c
+++ b/tests/regression/config.c
@ -19,6 +19,12 @@
        .name = "level -" #x,                                       \
        .cli_args = "--fast=" #x,                                   \
        .param_values = PARAM_VALUES(level_fast##x##_param_values), \
+    };                                                              \
+    config_t const level_fast##x##_dict = {                         \
+        .name = "level -" #x " with dict",                          \
+        .cli_args = "--fast=" #x,                                   \
+        .param_values = PARAM_VALUES(level_fast##x##_param_values), \
+        .use_dictionary = 1,                                        \
    };

 /* Define a config for each level we want to test with. */
@ -30,6 +36,12 @@
        .name = "level " #x,                                      \
        .cli_args = "-" #x,                                       \
        .param_values = PARAM_VALUES(level_##x##_param_values),   \
+    };                                                            \
+    config_t const level_##x##_dict = {                           \
+        .name = "level " #x " with dict",                         \
+        .cli_args = "-" #x,                                       \
+        .param_values = PARAM_VALUES(level_##x##_param_values),   \
+        .use_dictionary = 1,                                      \
    };


@ -41,17 +53,31 @@
 #undef LEVEL
 #undef FAST_LEVEL

+/**
+ * Config with an empty parameter list and no CLI arguments that only sets
+ * `no_pledged_src_size`. Declared const like every other config object; it is
+ * only ever referenced through `config_t const*`.
+ */
+static config_t const no_pledged_src_size = {
+    .name = "no source size",
+    .cli_args = "",
+    .param_values = {.data = NULL, .size = 0},
+    .no_pledged_src_size = 1,
+};
+
 static config_t const* g_configs[] = {
-#define FAST_LEVEL(x) &level_fast##x,
-#define LEVEL(x) &level_##x,
+
+#define FAST_LEVEL(x) &level_fast##x, &level_fast##x##_dict,
+#define LEVEL(x) &level_##x, &level_##x##_dict,
 #include "levels.h"
 #undef LEVEL
 #undef FAST_LEVEL
+
+    &no_pledged_src_size,
    NULL,
 };

 config_t const* const* configs = g_configs;

+/* A config is skipped only when it needs a dictionary the data lacks. */
+int config_skip_data(config_t const* config, data_t const* data) {
+    if (!config->use_dictionary)
+        return 0;
+    return !data_has_dict(data);
+}
+
 int config_get_level(config_t const* config) {
    param_values_t const params = config->param_values;
    size_t i;
--- a/tests/regression/config.h
+++ b/tests/regression/config.h
@ -16,6 +16,8 @@
 #define ZSTD_STATIC_LINKING_ONLY
 #include <zstd.h>

+#include "data.h"
+
 typedef struct {
    ZSTD_cParameter param;
    unsigned value;
@ -41,8 +43,25 @@ typedef struct {
     * the parameters will be derived from these.
     */
    param_values_t param_values;
+    /**
+     * Boolean parameter that says if we should use a dictionary. If the data
+     * doesn't have a dictionary, this config is skipped. Defaults to no.
+     */
+    int use_dictionary;
+    /**
+     * Boolean parameter that says if we should pass the pledged source size
+     * when the method allows it. Defaults to yes.
+     */
+    int no_pledged_src_size;
 } config_t;

+/**
+ * Returns true if the config should skip this data.
+ * For instance, if the config requires a dictionary but the data doesn't have
+ * one.
+ */
+int config_skip_data(config_t const* config, data_t const* data);
+
 #define CONFIG_NO_LEVEL (-ZSTD_TARGETLENGTH_MAX - 1)
 /**
 * Returns the compression level specified by the config, or CONFIG_NO_LEVEL if
--- a/tests/regression/data.c
+++ b/tests/regression/data.c
@ -32,27 +32,58 @@
    "https://github.com/facebook/zstd/releases/download/regression-data/" x

 data_t silesia = {
-    .url = REGRESSION_RELEASE("silesia.tar.zst"),
    .name = "silesia",
    .type = data_type_dir,
-    .xxhash64 = 0x67558ee5506918b4LL,
+    .data =
+        {
+            .url = REGRESSION_RELEASE("silesia.tar.zst"),
+            .xxhash64 = 0x48a199f92f93e977LL,
+        },
 };

 data_t silesia_tar = {
-    .url = REGRESSION_RELEASE("silesia.tar.zst"),
    .name = "silesia.tar",
    .type = data_type_file,
-    .xxhash64 = 0x67558ee5506918b4LL,
+    .data =
+        {
+            .url = REGRESSION_RELEASE("silesia.tar.zst"),
+            .xxhash64 = 0x48a199f92f93e977LL,
+        },
+};
+
+/**
+ * Directory data set that also provides a dictionary resource, so
+ * data_has_dict() is true for it and configs with `use_dictionary` can run.
+ */
+data_t github = {
+    .name = "github",
+    .type = data_type_dir,
+    .data =
+        {
+            .url = REGRESSION_RELEASE("github.tar.zst"),
+            .xxhash64 = 0xa9b1b44b020df292LL,
+        },
+    .dict =
+        {
+            .url = REGRESSION_RELEASE("github.dict.zst"),
+            .xxhash64 = 0x1eddc6f737d3cb53LL,
+
+        },
 };

 static data_t* g_data[] = {
    &silesia,
    &silesia_tar,
+    &github,
    NULL,
 };

 data_t const* const* data = (data_t const* const*)g_data;

+/**
+ * data helpers.
+ */
+
+/* Data has a dictionary exactly when a dictionary URL was configured. */
+int data_has_dict(data_t const* data) {
+    char const* const dict_url = data->dict.url;
+    return dict_url != NULL;
+}
+
 /**
 * data buffer helper functions (documented in header).
 */
@ -100,16 +131,24 @@ err:
    free(buffer.data);
    memset(&buffer, 0, sizeof(buffer));
    return buffer;
-
 }

-data_buffer_t data_buffer_get(data_t const* data) {
+data_buffer_t data_buffer_get_data(data_t const* data) {
    data_buffer_t const kEmptyBuffer = {};

    if (data->type != data_type_file)
        return kEmptyBuffer;

-    return data_buffer_read(data->path);
+    return data_buffer_read(data->data.path);
+}
+
+data_buffer_t data_buffer_get_dict(data_t const* data) {
+    /* Read the dictionary file when there is one ... */
+    if (data_has_dict(data))
+        return data_buffer_read(data->dict.path);
+
+    /* ... otherwise hand back an empty buffer. */
+    data_buffer_t const kEmptyBuffer = {};
+    return kEmptyBuffer;
 }

 int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
@ -124,13 +163,69 @@ int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
        return 0;
    assert(buffer1.size > buffer2.size);
    return 1;
-
 }

 void data_buffer_free(data_buffer_t buffer) {
    free(buffer.data);
 }

+/**
+ * data filenames helpers.
+ */
+
+data_filenames_t data_filenames_get(data_t const* data) {
+    char const* root = data->data.path;
+    data_filenames_t listing = {.buffer = NULL, .size = 0};
+
+    /* Expand the single root path into the file list; links not followed. */
+    listing.filenames = UTIL_createFileList(
+        &root, 1, &listing.buffer, &listing.size, /* followLinks */ 0);
+    return listing;
+}
+
+void data_filenames_free(data_filenames_t filenames) {
+    /* Both pieces were handed out together by UTIL_createFileList(). */
+    char const** const table = filenames.filenames;
+    char* const storage = filenames.buffer;
+    UTIL_freeFileList(table, storage);
+}
+
+/**
+ * data buffers helpers.
+ */
+
+/**
+ * Reads every file of `data` into its own buffer.
+ *
+ * @returns One buffer per file. The result has size 0 and a NULL table if the
+ *          file list is empty, the table cannot be allocated, or any file
+ *          cannot be read. Release the result with data_buffers_free().
+ */
+data_buffers_t data_buffers_get(data_t const* data) {
+    data_buffers_t const kEmptyBuffers = {.size = 0};
+    data_buffers_t buffers = kEmptyBuffers;
+    data_buffer_t* buffersPtr = NULL;
+    data_filenames_t filenames = data_filenames_get(data);
+
+    if (filenames.size == 0)
+        goto out;
+
+    /* Zeroed so that a partially filled table can be released safely. */
+    buffersPtr = (data_buffer_t*)calloc(filenames.size, sizeof(*buffersPtr));
+    if (buffersPtr == NULL)
+        goto out;
+    buffers.buffers = (data_buffer_t const*)buffersPtr;
+    buffers.size = filenames.size;
+
+    for (size_t i = 0; i < filenames.size; ++i) {
+        buffersPtr[i] = data_buffer_read(filenames.filenames[i]);
+        if (buffersPtr[i].data == NULL) {
+            /* Drop everything read so far and report failure. */
+            data_buffers_free(buffers);
+            buffers = kEmptyBuffers;
+            goto out;
+        }
+    }
+
+out:
+    /* The file list is only needed while reading; free it on every path. */
+    data_filenames_free(filenames);
+    return buffers;
+}
+
+/**
+ * Frees the data buffers: the contents of every entry, then the table itself.
+ */
+void data_buffers_free(data_buffers_t buffers) {
+    for (size_t i = 0; i < buffers.size; ++i)
+        data_buffer_free(buffers.buffers[i]);
+    free((data_buffer_t*)buffers.buffers);
+}
+
 /**
 * Initialization and download functions.
 */
@ -174,18 +269,23 @@ out:
 static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const size1 = strlen(str1);
    size_t const size2 = strlen(str2);
-    size_t const size3 = strlen(str3);
+    size_t const size3 = str3 == NULL ? 0 : strlen(str3);
    size_t const size = size1 + size2 + size3 + 1;
    char* const dst = (char*)malloc(size);
    if (dst == NULL)
        return NULL;
    strcpy(dst, str1);
    strcpy(dst + size1, str2);
-    strcpy(dst + size1 + size2, str3);
+    if (str3 != NULL)
+        strcpy(dst + size1 + size2, str3);
    assert(strlen(dst) == size1 + size2 + size3);
    return dst;
 }

+/* Two-string concatenation: cat3() with its third string left out (NULL). */
+static char* cat2(char const* str1, char const* str2) {
+    char* const joined = cat3(str1, str2, NULL);
+    return joined;
+}
+
 /**
 * State needed by the curl callback.
 * It takes data from curl, hashes it, and writes it to the file.
@ -197,16 +297,18 @@ typedef struct {
 } curl_data_t;

 /** Create the curl state. */
-static curl_data_t curl_data_create(data_t const* data) {
+static curl_data_t curl_data_create(
+    data_resource_t const* resource,
+    data_type_t type) {
    curl_data_t cdata = {};

    XXH64_reset(&cdata.xxhash64, 0);

    assert(UTIL_isDirectory(g_data_dir));

-    if (data->type == data_type_file) {
+    if (type == data_type_file) {
        /* Decompress the resource and store to the path. */
-        char* cmd = cat3("zstd -dqfo '", data->path, "'");
+        char* cmd = cat3("zstd -dqfo '", resource->path, "'");
        if (cmd == NULL) {
            cdata.error = ENOMEM;
            return cdata;
@ -243,54 +345,68 @@ static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
    return written;
 }

-/** Download a single data object. */
-static int curl_download_datum(CURL* curl, data_t const* data) {
-    curl_data_t cdata = curl_data_create(data);
-    int err = EFAULT;
-
-    if (cdata.error != 0) {
-        err = cdata.error;
-        goto out;
-    }
-
+/**
+ * Downloads `resource->url` through `curl`, feeding the bytes to the state
+ * built by curl_data_create() for `resource` and `type`, then verifies the
+ * result.
+ *
+ * @returns 0 on success. EINVAL if a curl option cannot be set or the
+ *          checksum does not match `resource->xxhash64`; the error reported
+ *          by curl_data_create() if it fails; EIO if the transfer fails,
+ *          closing the state fails, or the expected file/directory is
+ *          missing afterwards.
+ */
+static int curl_download_resource(
+    CURL* curl,
+    data_resource_t const* resource,
+    data_type_t type) {
+    curl_data_t cdata;
     /* Download the data. */
-    if (curl_easy_setopt(curl, CURLOPT_URL, data->url) != 0)
-        goto out;
+    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
+        return EINVAL;
+    /* Only the address of cdata is registered here; it is filled in below,
+     * before curl_easy_perform() runs. */
     if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0)
-        goto out;
-    if (curl_easy_perform(curl) != 0) {
-        fprintf(stderr, "downloading '%s' failed\n", data->url);
-        goto out;
-    }
-    /* check that the file exists. */
-    if (data->type == data_type_file && !UTIL_isRegularFile(data->path)) {
-        fprintf(stderr, "output file '%s' does not exist\n", data->path);
-        goto out;
-    }
-    if (data->type == data_type_dir && !UTIL_isDirectory(data->path)) {
-        fprintf(stderr, "output directory '%s' does not exist\n", data->path);
-        goto out;
-    }
-    /* Check that the hash matches. */
-    if (XXH64_digest(&cdata.xxhash64) != data->xxhash64) {
+        return EINVAL;
+    cdata = curl_data_create(resource, type);
+    if (cdata.error != 0)
+        return cdata.error;
+    int const curl_err = curl_easy_perform(curl);
+    int const close_err = curl_data_free(cdata);
+    if (curl_err) {
         fprintf(
             stderr,
-            "checksum does not match: %llx != %llx\n",
+            "downloading '%s' for '%s' failed\n",
+            resource->url,
+            resource->path);
+        return EIO;
+    }
+    if (close_err) {
+        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
+        return EIO;
+    }
+    /* check that the file exists. */
+    if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
+        fprintf(stderr, "output file '%s' does not exist\n", resource->path);
+        return EIO;
+    }
+    if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
+        fprintf(
+            stderr, "output directory '%s' does not exist\n", resource->path);
+        return EIO;
+    }
+    /* Check that the hash matches. */
+    if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) {
+        fprintf(
+            stderr,
+            "checksum does not match: 0x%llxLL != 0x%llxLL\n",
             (unsigned long long)XXH64_digest(&cdata.xxhash64),
-            (unsigned long long)data->xxhash64);
-        goto out;
+            (unsigned long long)resource->xxhash64);
+        return EINVAL;
     }

-    err = 0;
-out:
-    if (err != 0)
-        fprintf(stderr, "downloading '%s' failed\n", data->name);
-    int const close_err = curl_data_free(cdata);
-    if (close_err != 0 && err == 0) {
-        fprintf(stderr, "failed to write data for '%s'\n", data->name);
-        err = close_err;
+    return 0;
+}
+
+/**
+ * Download a single data object: the data resource first, then the
+ * dictionary (always as a plain file) if the data has one.
+ *
+ * @returns 0 on success, otherwise the first non-zero error returned by
+ *          curl_download_resource().
+ */
+static int curl_download_datum(CURL* curl, data_t const* data) {
+    int ret;
+    ret = curl_download_resource(curl, &data->data, data->type);
+    if (ret != 0)
+        return ret;
+    if (data_has_dict(data)) {
+        ret = curl_download_resource(curl, &data->dict, data_type_file);
+        if (ret != 0)
+            return ret;
     }
-    return err;
+    return ret;
 }

 /** Download all the data. */
@ -331,9 +447,14 @@ static int data_create_paths(data_t* const* data, char const* dir) {
    assert(data != NULL);
    for (; *data != NULL; ++data) {
        data_t* const datum = *data;
-        datum->path = cat3(dir, "/", datum->name);
-        if (datum->path == NULL)
+        datum->data.path = cat3(dir, "/", datum->name);
+        if (datum->data.path == NULL)
            return ENOMEM;
+        if (data_has_dict(datum)) {
+            datum->dict.path = cat2(datum->data.path, ".dict");
+            if (datum->dict.path == NULL)
+                return ENOMEM;
+        }
    }
    return 0;
 }
@ -343,8 +464,10 @@ static void data_free_paths(data_t* const* data) {
    assert(data != NULL);
    for (; *data != NULL; ++data) {
        data_t* datum = *data;
-        free((void*)datum->path);
-        datum->path = NULL;
+        free((void*)datum->data.path);
+        free((void*)datum->dict.path);
+        datum->data.path = NULL;
+        datum->dict.path = NULL;
    }
 }

@ -367,7 +490,8 @@ static uint64_t stamp_hash(data_t const* const* data) {
        /* We don't care about the URL that we fetch from. */
        /* The path is derived from the name. */
        XXH64_update(&state, datum->name, strlen(datum->name));
-        xxh_update_le(&state, datum->xxhash64);
+        xxh_update_le(&state, datum->data.xxhash64);
+        xxh_update_le(&state, datum->dict.xxhash64);
        xxh_update_le(&state, datum->type);
    }
    return XXH64_digest(&state);
--- a/tests/regression/data.h
+++ b/tests/regression/data.h
@ -22,10 +22,14 @@ typedef enum {
 typedef struct {
    char const* url;   /**< Where to get this resource. */
    uint64_t xxhash64; /**< Hash of the url contents. */
-    char const* name;  /**< The logical name of the resource (no extension). */
-    data_type_t type;  /**< The type of this resource. */
    char const* path;  /**< The path of the unpacked resource (derived). */
-    size_t size;
+} data_resource_t;
+
+typedef struct {
+    data_resource_t data;
+    data_resource_t dict;
+    data_type_t type;  /**< The type of the data. */
+    char const* name;  /**< The logical name of the data (no extension). */
 } data_t;

 /**
@ -33,6 +37,9 @@ typedef struct {
 */
 extern data_t const* const* data;

+
+int data_has_dict(data_t const* data);
+
 /**
 * Initializes the data module and downloads the data necessary.
 * Caches the downloads in dir. We add a stamp file in the directory after
@ -62,7 +69,14 @@ typedef struct {
 *
 * @returns The buffer, which is NULL on failure.
 */
-data_buffer_t data_buffer_get(data_t const* data);
+data_buffer_t data_buffer_get_data(data_t const* data);
+
+/**
+ * Read the dictionary that the data points to into a buffer.
+ *
+ * @returns The buffer, which is NULL on failure.
+ */
+data_buffer_t data_buffer_get_dict(data_t const* data);

 /**
 * Read the contents of filename into a buffer.
@ -88,5 +102,39 @@ int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2);
 */
 void data_buffer_free(data_buffer_t buffer);

+/** File list produced by data_filenames_get(); free with data_filenames_free(). */
+typedef struct {
+    char* buffer;           /**< Storage filled in by UTIL_createFileList(). */
+    char const** filenames; /**< Table returned by UTIL_createFileList(). */
+    unsigned size;          /**< Entry count filled in by UTIL_createFileList(). */
+} data_filenames_t;
+
+/**
+ * Get a recursive list of filenames in the data object. If it is a file, it
+ * will only contain one entry. If it is a directory, it will recursively walk
+ * the directory.
+ *
+ * @returns The list of filenames, which has size 0 and NULL pointers on error.
+ */
+data_filenames_t data_filenames_get(data_t const* data);
+
+/**
+ * Frees the filenames table.
+ */
+void data_filenames_free(data_filenames_t filenames);
+
+/** A list of buffers, as returned by data_buffers_get(). */
+typedef struct {
+    data_buffer_t const* buffers; /**< Array of `size` buffers. */
+    size_t size;                  /**< Number of entries in `buffers`. */
+} data_buffers_t;
+
+/**
+ * @returns a list of buffers for every file in data. It is zero sized on error.
+ */
+data_buffers_t data_buffers_get(data_t const* data);
+
+/**
+ * Frees the data buffers.
+ */
+void data_buffers_free(data_buffers_t buffers);

 #endif
--- a/tests/regression/method.c
+++ b/tests/regression/method.c
@ -26,62 +26,88 @@ void method_set_zstdcli(char const* zstdcli) {
 * the given name, member.
 *
 *     method_state_t* base = ...;
- *     simple_state_t* state = container_of(base, simple_state_t, base);
+ *     buffer_state_t* state = container_of(base, buffer_state_t, base);
 */
 #define container_of(ptr, type, member) \
-    ((type*)(char*)(ptr)-offsetof(type, member))
+    ((type*)((ptr) == NULL ? NULL : (char*)(ptr)-offsetof(type, member)))

 /** State to reuse the same buffers between compression calls. */
 typedef struct {
    method_state_t base;
-    data_buffer_t buffer;        /**< The constant input data buffer. */
-    data_buffer_t compressed;    /**< The compressed data buffer. */
-    data_buffer_t decompressed;  /**< The decompressed data buffer. */
-} simple_state_t;
+    data_buffers_t inputs; /**< The input buffer for each file. */
+    data_buffer_t compressed; /**< The compressed data buffer. */
+    data_buffer_t decompressed; /**< The decompressed data buffer. */
+} buffer_state_t;

-static method_state_t* simple_create(data_t const* data) {
-    simple_state_t* state = (simple_state_t*)calloc(1, sizeof(simple_state_t));
+/** Returns the size of the largest buffer in `buffers`, or 0 if it is empty. */
+static size_t buffers_max_size(data_buffers_t buffers) {
+    size_t largest = 0;
+    size_t idx = buffers.size;
+    while (idx-- > 0) {
+        size_t const candidate = buffers.buffers[idx].size;
+        largest = candidate > largest ? candidate : largest;
+    }
+    return largest;
+}
+
+/**
+ * Creates the shared buffer state for `data`: reads every input file and
+ * sizes the compressed/decompressed scratch buffers for the largest one.
+ *
+ * @returns The base of a new buffer_state_t, or NULL if the state struct
+ *          cannot be allocated. Failures to load inputs or create the scratch
+ *          buffers are not reported here; buffer_state_bad() checks for them.
+ */
+static method_state_t* buffer_state_create(data_t const* data) {
+    buffer_state_t* state = (buffer_state_t*)calloc(1, sizeof(buffer_state_t));
     if (state == NULL)
         return NULL;
     state->base.data = data;
-    state->buffer = data_buffer_get(data);
-    state->compressed =
-        data_buffer_create(ZSTD_compressBound(state->buffer.size));
-    state->decompressed = data_buffer_create(state->buffer.size);
+    state->inputs = data_buffers_get(data);
+    size_t const max_size = buffers_max_size(state->inputs);
+    state->compressed = data_buffer_create(ZSTD_compressBound(max_size));
+    state->decompressed = data_buffer_create(max_size);
     return &state->base;
 }

-static void simple_destroy(method_state_t* base) {
+/**
+ * Destroys a state made by buffer_state_create(), including the buffers it
+ * holds. Accepts NULL.
+ */
+static void buffer_state_destroy(method_state_t* base) {
     if (base == NULL)
         return;
-    simple_state_t* state = container_of(base, simple_state_t, base);
+    buffer_state_t* state = container_of(base, buffer_state_t, base);
+    /* The state owns its buffers; freeing only the struct would leak them. */
+    data_buffers_free(state->inputs);
+    data_buffer_free(state->compressed);
+    data_buffer_free(state->decompressed);
     free(state);
 }

-static result_t simple_compress(method_state_t* base, config_t const* config) {
-    if (base == NULL)
-        return result_error(result_error_system_error);
-    simple_state_t* state = container_of(base, simple_state_t, base);
+/**
+ * Validates a buffer state before use, logging the reason to stderr.
+ *
+ * @returns 1 if `state` is NULL, has no inputs, or lacks either scratch
+ *          buffer; 0 otherwise.
+ */
+static int buffer_state_bad(buffer_state_t const* state) {
+    if (state == NULL) {
+        fprintf(stderr, "buffer_state_t is NULL\n");
+        return 1;
+    }
+
+    int const has_inputs = state->inputs.size != 0;
+    int const has_scratch =
+        state->compressed.data != NULL && state->decompressed.data != NULL;
+    if (has_inputs && has_scratch)
+        return 0;
+
+    fprintf(stderr, "buffer state allocation failure\n");
+    return 1;
+}

+static result_t simple_compress(method_state_t* base, config_t const* config) {
+    buffer_state_t* state = container_of(base, buffer_state_t, base);
+
+    if (buffer_state_bad(state))
+        return result_error(result_error_system_error);
+
+    /* Keep the tests short by skipping directories, since behavior shouldn't
+     * change.
+     */
    if (base->data->type != data_type_file)
        return result_error(result_error_skip);

-    if (state->buffer.data == NULL || state->compressed.data == NULL ||
-        state->decompressed.data == NULL) {
-        return result_error(result_error_system_error);
-    }
+    if (config->use_dictionary || config->no_pledged_src_size)
+        return result_error(result_error_skip);

    /* If the config doesn't specify a level, skip. */
    int const level = config_get_level(config);
    if (level == CONFIG_NO_LEVEL)
        return result_error(result_error_skip);

+    data_buffer_t const input = state->inputs.buffers[0];
+
    /* Compress, decompress, and check the result. */
    state->compressed.size = ZSTD_compress(
        state->compressed.data,
        state->compressed.capacity,
-        state->buffer.data,
-        state->buffer.size,
+        input.data,
+        input.size,
        level);
    if (ZSTD_isError(state->compressed.size))
        return result_error(result_error_compression_error);
@ -93,7 +119,7 @@ static result_t simple_compress(method_state_t* base, config_t const* config) {
        state->compressed.size);
    if (ZSTD_isError(state->decompressed.size))
        return result_error(result_error_decompression_error);
-    if (data_buffer_compare(state->buffer, state->decompressed))
+    if (data_buffer_compare(input, state->decompressed))
        return result_error(result_error_round_trip_error);

    result_data_t data;
@ -101,6 +127,70 @@ static result_t simple_compress(method_state_t* base, config_t const* config) {
    return result_data(data);
 }

+/**
+ * Compresses every input file with ZSTD_compressCCtx(), reusing one context
+ * for all of them, and checks that each file round trips.
+ *
+ * Skips configs that use a dictionary or no pledged source size, data that is
+ * not a directory, and configs without a compression level.
+ *
+ * @returns The summed compressed size of all files, or the first error hit.
+ */
+static result_t compress_cctx_compress(
+    method_state_t* base,
+    config_t const* config) {
+    buffer_state_t* const state = container_of(base, buffer_state_t, base);
+    if (buffer_state_bad(state))
+        return result_error(result_error_system_error);
+
+    int const unsupported_config =
+        config->use_dictionary || config->no_pledged_src_size;
+    if (unsupported_config)
+        return result_error(result_error_skip);
+
+    if (base->data->type != data_type_dir)
+        return result_error(result_error_skip);
+
+    int const level = config_get_level(config);
+    if (level == CONFIG_NO_LEVEL)
+        return result_error(result_error_skip);
+
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    if (cctx == NULL) {
+        fprintf(stderr, "ZSTD_createCCtx() failed\n");
+        return result_error(result_error_system_error);
+    }
+
+    data_buffer_t* const dst = &state->compressed;
+    data_buffer_t* const roundtrip = &state->decompressed;
+    result_data_t totals = {.total_size = 0};
+    result_t outcome;
+
+    size_t idx = 0;
+    while (idx < state->inputs.size) {
+        data_buffer_t const src = state->inputs.buffers[idx++];
+
+        dst->size = ZSTD_compressCCtx(
+            cctx, dst->data, dst->capacity, src.data, src.size, level);
+        if (ZSTD_isError(dst->size)) {
+            outcome = result_error(result_error_compression_error);
+            goto cleanup;
+        }
+
+        roundtrip->size = ZSTD_decompress(
+            roundtrip->data, roundtrip->capacity, dst->data, dst->size);
+        if (ZSTD_isError(roundtrip->size)) {
+            outcome = result_error(result_error_decompression_error);
+            goto cleanup;
+        }
+        if (data_buffer_compare(src, *roundtrip)) {
+            outcome = result_error(result_error_round_trip_error);
+            goto cleanup;
+        }
+
+        totals.total_size += dst->size;
+    }
+    outcome = result_data(totals);
+
+cleanup:
+    /* Every exit after the context is created passes through here. */
+    ZSTD_freeCCtx(cctx);
+    return outcome;
+}
+
 /** Generic state creation function. */
 static method_state_t* method_state_create(data_t const* data) {
    method_state_t* state = (method_state_t*)malloc(sizeof(method_state_t));
@ -114,26 +204,32 @@ static void method_state_destroy(method_state_t* state) {
    free(state);
 }

-#define MAX_OUT 32
-
-static result_t cli_file_compress(
+static result_t cli_compress(
    method_state_t* state,
    config_t const* config) {
    if (config->cli_args == NULL)
        return result_error(result_error_skip);

+    /* We don't support no pledged source size with directories. Too slow. */
+    if (state->data->type == data_type_dir && config->no_pledged_src_size)
+        return result_error(result_error_skip);
+
    if (g_zstdcli == NULL)
        return result_error(result_error_system_error);

-    /* '<zstd>' -r <args> '<file/dir>' | wc -c */
+    /* '<zstd>' -cqr <args> [-D '<dict>'] '<file/dir>' */
    char cmd[1024];
    size_t const cmd_size = snprintf(
        cmd,
        sizeof(cmd),
-        "'%s' -cqr %s '%s' | wc -c",
+        "'%s' -cqr %s %s%s%s %s '%s'",
        g_zstdcli,
        config->cli_args,
-        state->data->path);
+        config->use_dictionary ? "-D '" : "",
+        config->use_dictionary ? state->data->dict.path : "",
+        config->use_dictionary ? "'" : "",
+        config->no_pledged_src_size ? "<" : "",
+        state->data->data.path);
    if (cmd_size >= sizeof(cmd)) {
        fprintf(stderr, "command too large: %s\n", cmd);
        return result_error(result_error_system_error);
@ -144,42 +240,48 @@ static result_t cli_file_compress(
        return result_error(result_error_system_error);
    }

-    /* Read the total compressed size. */
-    char out[MAX_OUT + 1];
-    size_t const out_size = fread(out, 1, MAX_OUT, zstd);
-    out[out_size] = '\0';
-    int const zstd_ret = pclose(zstd);
-    if (zstd_ret != 0) {
+    char out[4096];
+    size_t total_size = 0;
+    while (1) {
+        size_t const size = fread(out, 1, sizeof(out), zstd);
+        total_size += size;
+        if (size != sizeof(out))
+            break;
+    }
+    if (ferror(zstd) || pclose(zstd) != 0) {
        fprintf(stderr, "zstd failed with command: %s\n", cmd);
        return result_error(result_error_compression_error);
    }
-    if (out_size == MAX_OUT) {
-        fprintf(stderr, "wc -c produced more bytes than expected: %s\n", out);
-        return result_error(result_error_system_error);
-    }

-    result_data_t data;
-    data.total_size = atoll(out);
+    result_data_t const data = {.total_size = total_size};
    return result_data(data);
 }

 method_t const simple = {
-    .name = "simple",
-    .create = simple_create,
+    .name = "ZSTD_compress",
+    .create = buffer_state_create,
    .compress = simple_compress,
-    .destroy = simple_destroy,
+    .destroy = buffer_state_destroy,
 };

-method_t const cli_file = {
-    .name = "cli file",
+/**
+ * Method that runs compress_cctx_compress() on the shared buffer state
+ * (same create/destroy hooks as the `simple` method).
+ */
+method_t const compress_cctx = {
+    .name = "ZSTD_compressCCtx",
+    .create = buffer_state_create,
+    .compress = compress_cctx_compress,
+    .destroy = buffer_state_destroy,
+};
+
+method_t const cli = {
+    .name = "zstdcli",
    .create = method_state_create,
-    .compress = cli_file_compress,
+    .compress = cli_compress,
    .destroy = method_state_destroy,
 };

 static method_t const* g_methods[] = {
    &simple,
-    &cli_file,
+    &compress_cctx,
+    &cli,
    NULL,
 };

--- a/tests/regression/results.csv
+++ b/tests/regression/results.csv
@ -1,43 +1,101 @@
-Data,	Config,	Method,	Total compressed size
-silesia.tar,	level -5,	simple,	106176430
-silesia.tar,	level -3,	simple,	98476550
-silesia.tar,	level -1,	simple,	87206767
-silesia.tar,	level 0,	simple,	66996953
-silesia.tar,	level 1,	simple,	73658303
-silesia.tar,	level 3,	simple,	66996953
-silesia.tar,	level 4,	simple,	65996020
-silesia.tar,	level 5,	simple,	64421326
-silesia.tar,	level 6,	simple,	62388673
-silesia.tar,	level 7,	simple,	61159525
-silesia.tar,	level 9,	simple,	60214921
-silesia.tar,	level 13,	simple,	58428642
-silesia.tar,	level 16,	simple,	56363759
-silesia.tar,	level 19,	simple,	53274173
-silesia,	level -5,	cli file,	106202112
-silesia,	level -3,	cli file,	98518660
-silesia,	level -1,	cli file,	87226203
-silesia,	level 0,	cli file,	67049190
-silesia,	level 1,	cli file,	73676282
-silesia,	level 3,	cli file,	67049190
-silesia,	level 4,	cli file,	66090040
-silesia,	level 5,	cli file,	64503721
-silesia,	level 6,	cli file,	62446177
-silesia,	level 7,	cli file,	61217029
-silesia,	level 9,	cli file,	60282841
-silesia,	level 13,	cli file,	58480658
-silesia,	level 16,	cli file,	56414170
-silesia,	level 19,	cli file,	53365292
-silesia.tar,	level -5,	cli file,	106250113
-silesia.tar,	level -3,	cli file,	98550747
-silesia.tar,	level -1,	cli file,	87227322
-silesia.tar,	level 0,	cli file,	67111168
-silesia.tar,	level 1,	cli file,	73694374
-silesia.tar,	level 3,	cli file,	67111168
-silesia.tar,	level 4,	cli file,	66154079
-silesia.tar,	level 5,	cli file,	64546998
-silesia.tar,	level 6,	cli file,	62458454
-silesia.tar,	level 7,	cli file,	61231085
-silesia.tar,	level 9,	cli file,	60310313
-silesia.tar,	level 13,	cli file,	58517476
-silesia.tar,	level 16,	cli file,	56448694
-silesia.tar,	level 19,	cli file,	53444920
+Data,               Config,             Method,             Total compressed size
+This line is intentionally added to see how the nightly job reports failures
+silesia.tar,        level -5,           ZSTD_compress,      7160438
+silesia.tar,        level -3,           ZSTD_compress,      6789024
+silesia.tar,        level -1,           ZSTD_compress,      6195462
+silesia.tar,        level 0,            ZSTD_compress,      4875071
+silesia.tar,        level 1,            ZSTD_compress,      5339697
+silesia.tar,        level 3,            ZSTD_compress,      4875071
+silesia.tar,        level 4,            ZSTD_compress,      4813104
+silesia.tar,        level 5,            ZSTD_compress,      4726961
+silesia.tar,        level 6,            ZSTD_compress,      4654401
+silesia.tar,        level 7,            ZSTD_compress,      4591933
+silesia.tar,        level 9,            ZSTD_compress,      4554098
+silesia.tar,        level 13,           ZSTD_compress,      4503496
+silesia.tar,        level 16,           ZSTD_compress,      4387233
+silesia.tar,        level 19,           ZSTD_compress,      4283123
+silesia,            level -5,           ZSTD_compressCCtx,  7152294
+silesia,            level -3,           ZSTD_compressCCtx,  6789969
+silesia,            level -1,           ZSTD_compressCCtx,  6191548
+silesia,            level 0,            ZSTD_compressCCtx,  4862377
+silesia,            level 1,            ZSTD_compressCCtx,  5318036
+silesia,            level 3,            ZSTD_compressCCtx,  4862377
+silesia,            level 4,            ZSTD_compressCCtx,  4800629
+silesia,            level 5,            ZSTD_compressCCtx,  4715005
+silesia,            level 6,            ZSTD_compressCCtx,  4644055
+silesia,            level 7,            ZSTD_compressCCtx,  4581559
+silesia,            level 9,            ZSTD_compressCCtx,  4543862
+silesia,            level 13,           ZSTD_compressCCtx,  4493931
+silesia,            level 16,           ZSTD_compressCCtx,  4381885
+silesia,            level 19,           ZSTD_compressCCtx,  4296899
+github,             level -5,           ZSTD_compressCCtx,  232744
+github,             level -3,           ZSTD_compressCCtx,  220611
+github,             level -1,           ZSTD_compressCCtx,  176575
+github,             level 0,            ZSTD_compressCCtx,  136397
+github,             level 1,            ZSTD_compressCCtx,  143457
+github,             level 3,            ZSTD_compressCCtx,  136397
+github,             level 4,            ZSTD_compressCCtx,  136144
+github,             level 5,            ZSTD_compressCCtx,  135106
+github,             level 6,            ZSTD_compressCCtx,  135108
+github,             level 7,            ZSTD_compressCCtx,  135108
+github,             level 9,            ZSTD_compressCCtx,  135108
+github,             level 13,           ZSTD_compressCCtx,  133741
+github,             level 16,           ZSTD_compressCCtx,  133741
+github,             level 19,           ZSTD_compressCCtx,  133717
+silesia,            level -5,           zstdcli,            7152342
+silesia,            level -3,           zstdcli,            6790021
+silesia,            level -1,           zstdcli,            6191597
+silesia,            level 0,            zstdcli,            4862425
+silesia,            level 1,            zstdcli,            5318084
+silesia,            level 3,            zstdcli,            4862425
+silesia,            level 4,            zstdcli,            4800677
+silesia,            level 5,            zstdcli,            4715053
+silesia,            level 6,            zstdcli,            4644103
+silesia,            level 7,            zstdcli,            4581607
+silesia,            level 9,            zstdcli,            4543910
+silesia,            level 13,           zstdcli,            4493979
+silesia,            level 16,           zstdcli,            4381933
+silesia,            level 19,           zstdcli,            4296947
+silesia.tar,        level -5,           zstdcli,            7159586
+silesia.tar,        level -3,           zstdcli,            6791018
+silesia.tar,        level -1,           zstdcli,            6196283
+silesia.tar,        level 0,            zstdcli,            4876730
+silesia.tar,        level 1,            zstdcli,            5340312
+silesia.tar,        level 3,            zstdcli,            4876730
+silesia.tar,        level 4,            zstdcli,            4817723
+silesia.tar,        level 5,            zstdcli,            4730389
+silesia.tar,        level 6,            zstdcli,            4655708
+silesia.tar,        level 7,            zstdcli,            4593407
+silesia.tar,        level 9,            zstdcli,            4556135
+silesia.tar,        level 13,           zstdcli,            4503500
+silesia.tar,        level 16,           zstdcli,            4387237
+silesia.tar,        level 19,           zstdcli,            4283127
+silesia.tar,        no source size,     zstdcli,            4876726
+github,             level -5,           zstdcli,            234744
+github,             level -5 with dict, zstdcli,            47528
+github,             level -3,           zstdcli,            222611
+github,             level -3 with dict, zstdcli,            46394
+github,             level -1,           zstdcli,            178575
+github,             level -1 with dict, zstdcli,            43401
+github,             level 0,            zstdcli,            138397
+github,             level 0 with dict,  zstdcli,            40316
+github,             level 1,            zstdcli,            145457
+github,             level 1 with dict,  zstdcli,            43242
+github,             level 3,            zstdcli,            138397
+github,             level 3 with dict,  zstdcli,            40316
+github,             level 4,            zstdcli,            138144
+github,             level 4 with dict,  zstdcli,            40292
+github,             level 5,            zstdcli,            137106
+github,             level 5 with dict,  zstdcli,            40938
+github,             level 6,            zstdcli,            137108
+github,             level 6 with dict,  zstdcli,            40632
+github,             level 7,            zstdcli,            137108
+github,             level 7 with dict,  zstdcli,            40766
+github,             level 9,            zstdcli,            137108
+github,             level 9 with dict,  zstdcli,            41326
+github,             level 13,           zstdcli,            135741
+github,             level 13 with dict, zstdcli,            41670
+github,             level 16,           zstdcli,            135741
+github,             level 16 with dict, zstdcli,            39940
+github,             level 19,           zstdcli,            135717
+github,             level 19 with dict, zstdcli,            39576
--- a/tests/regression/test.c
+++ b/tests/regression/test.c
@ -17,10 +17,15 @@
 #include "data.h"
 #include "method.h"

-/** Check if a name contains a comma. */
+static int g_max_name_len = 0;
+
+/** Check if a name is NULL or has a comma; also tracks the max name length. */
 static int is_name_bad(char const* name) {
    if (name == NULL)
        return 1;
+    int const len = strlen(name);
+    if (len > g_max_name_len)
+        g_max_name_len = len;
    for (; *name != '\0'; ++name)
        if (*name == ',')
            return 1;
@ -47,57 +52,6 @@ static int are_names_bad() {
    return 0;
 }

-/** Helper macro to print to stderr and a file. */
-#define tprintf(file, ...)            \
-    do {                              \
-        fprintf(file, __VA_ARGS__);   \
-        fprintf(stderr, __VA_ARGS__); \
-    } while (0)
-/** Helper macro to flush stderr and a file. */
-#define tflush(file)    \
-    do {                \
-        fflush(file);   \
-        fflush(stderr); \
-    } while (0)
-
-/**
- * Run all the regression tests and record the results table to results and
- * stderr progressively.
- */
-static int run_all(FILE* results) {
-    tprintf(results, "Data,\tConfig,\tMethod,\tTotal compressed size\n");
-    for (size_t method = 0; methods[method] != NULL; ++method) {
-        for (size_t datum = 0; data[datum] != NULL; ++datum) {
-            /* Create the state common to all configs */
-            method_state_t* state = methods[method]->create(data[datum]);
-            for (size_t config = 0; configs[config] != NULL; ++config) {
-                /* Print the result for the (method, data, config) tuple. */
-                result_t const result =
-                    methods[method]->compress(state, configs[config]);
-                if (result_is_skip(result))
-                    continue;
-                tprintf(
-                    results,
-                    "%s,\t%s,\t%s,\t",
-                    data[datum]->name,
-                    configs[config]->name,
-                    methods[method]->name);
-                if (result_is_error(result)) {
-                    tprintf(results, "%s\n", result_get_error_string(result));
-                } else {
-                    tprintf(
-                        results,
-                        "%llu\n",
-                        (unsigned long long)result_get_data(result).total_size);
-                }
-                tflush(results);
-            }
-            methods[method]->destroy(state);
-        }
-    }
-    return 0;
-}
-
 /**
 * Option parsing using getopt.
 * When you add a new option update: long_options, long_extras, and
@ -109,6 +63,9 @@ static char const* g_output = NULL;
 static char const* g_diff = NULL;
 static char const* g_cache = NULL;
 static char const* g_zstdcli = NULL;
+static char const* g_config = NULL;
+static char const* g_data = NULL;
+static char const* g_method = NULL;

 typedef enum {
    required_option,
@ -120,19 +77,22 @@ typedef enum {
 * Extra state that we need to keep per-option that we can't store in getopt.
 */
 struct option_extra {
-    int id;               /**< The short option name, used as an id. */
-    char const* help;     /**< The help message. */
+    int id; /**< The short option name, used as an id. */
+    char const* help; /**< The help message. */
    option_type opt_type; /**< The option type: required, optional, or help. */
-    char const** value;   /**< The value to set or NULL if no_argument. */
+    char const** value; /**< The value to set or NULL if no_argument. */
 };

 /** The options. */
 static struct option long_options[] = {
    {"cache", required_argument, NULL, 'c'},
-    {"diff", required_argument, NULL, 'd'},
-    {"help", no_argument, NULL, 'h'},
    {"output", required_argument, NULL, 'o'},
    {"zstd", required_argument, NULL, 'z'},
+    {"config", required_argument, NULL, 128},
+    {"data", required_argument, NULL, 129},
+    {"method", required_argument, NULL, 130},
+    {"diff", required_argument, NULL, 'd'},
+    {"help", no_argument, NULL, 'h'},
 };

 static size_t const nargs = sizeof(long_options) / sizeof(long_options[0]);
@ -140,10 +100,13 @@ static size_t const nargs = sizeof(long_options) / sizeof(long_options[0]);
 /** The extra info for the options. Must be in the same order as the options. */
 static struct option_extra long_extras[] = {
    {'c', "the cache directory", required_option, &g_cache},
-    {'d', "compare the results to this file", optional_option, &g_diff},
-    {'h', "display this message", help_option, NULL},
    {'o', "write the results here", required_option, &g_output},
    {'z', "zstd cli tool", required_option, &g_zstdcli},
+    {128, "use this config", optional_option, &g_config},
+    {129, "use this data", optional_option, &g_data},
+    {130, "use this method", optional_option, &g_method},
+    {'d', "compare the results to this file", optional_option, &g_diff},
+    {'h', "display this message", help_option, NULL},
 };

 /** The short options. Must correspond to the options. */
@ -169,14 +132,24 @@ static void print_help(void) {
    fprintf(stderr, "regression test runner\n");
    size_t const nargs = sizeof(long_options) / sizeof(long_options[0]);
    for (size_t i = 0; i < nargs; ++i) {
-        /* Short / long  - help [option type] */
-        fprintf(
-            stderr,
-            "-%c / --%s \t- %s %s\n",
-            long_options[i].val,
-            long_options[i].name,
-            long_extras[i].help,
-            required_message(long_extras[i].opt_type));
+        if (long_options[i].val < 128) {
+            /* Long / short  - help [option type] */
+            fprintf(
+                stderr,
+                "--%s / -%c \t- %s %s\n",
+                long_options[i].name,
+                long_options[i].val,
+                long_extras[i].help,
+                required_message(long_extras[i].opt_type));
+        } else {
+            /* Long only  - help [option type] */
+            fprintf(
+                stderr,
+                "--%s      \t- %s %s\n",
+                long_options[i].name,
+                long_extras[i].help,
+                required_message(long_extras[i].opt_type));
+        }
    }
 }

@ -220,8 +193,7 @@ static int parse_args(int argc, char** argv) {
            continue;
        fprintf(
            stderr,
-            "-%c / --%s is a required argument but is not set\n",
-            long_options[i].val,
+            "--%s is a required argument but is not set\n",
            long_options[i].name);
        bad = 1;
    }
@ -234,6 +206,88 @@ static int parse_args(int argc, char** argv) {
    return 0;
 }

+/**
+ * Helper macro to print to stderr and a file.
+ * The arguments are expanded into both fprintf() calls, so they are
+ * evaluated twice; do not pass expressions with side effects.
+ */
+#define tprintf(file, ...)            \
+    do {                              \
+        fprintf(file, __VA_ARGS__);   \
+        fprintf(stderr, __VA_ARGS__); \
+    } while (0)
+/** Helper macro to flush stderr and a file. */
+#define tflush(file)    \
+    do {                \
+        fflush(file);   \
+        fflush(stderr); \
+    } while (0)
+
+/**
+ * Number of spaces needed to pad name out to g_max_name_len characters.
+ * Returns 0 when the name is already at least that long.
+ */
+static int name_padding(char const* name) {
+    size_t const len = strlen(name);
+    size_t const max_len = g_max_name_len > 0 ? (size_t)g_max_name_len : 0;
+    return len < max_len ? (int)(max_len - len) : 0;
+}
+
+/**
+ * Print the "data, config, method, " prefix of a results row to both
+ * results and stderr. Each name is followed by ", " and then by enough
+ * spaces to pad it to g_max_name_len characters, so the columns line up.
+ *
+ * @param results     The results file to write to (stderr is written too).
+ * @param data_name   The name of the data; must not be NULL.
+ * @param config_name The name of the config; must not be NULL.
+ * @param method_name The name of the method; must not be NULL.
+ */
+void tprint_names(
+    FILE* results,
+    char const* data_name,
+    char const* config_name,
+    char const* method_name) {
+    /*
+     * The padding is clamped at zero: a negative field width passed via '*'
+     * is taken as a '-' flag with a positive width, which would add padding
+     * to a name that is already longer than the column.
+     */
+    int const data_padding = name_padding(data_name);
+    int const config_padding = name_padding(config_name);
+    int const method_padding = name_padding(method_name);
+
+    tprintf(
+        results,
+        "%s, %*s%s, %*s%s, %*s",
+        data_name,
+        data_padding,
+        "",
+        config_name,
+        config_padding,
+        "",
+        method_name,
+        method_padding,
+        "");
+}
+
+/**
+ * Run all the regression tests and record the results table to results and
+ * stderr progressively.
+ *
+ * Prints a header row, then one row per (method, data, config) tuple. When
+ * g_method, g_data, or g_config is non-NULL, only entries whose name matches
+ * it exactly (strcmp() == 0) are run. Tuples for which config_skip_data()
+ * returns non-zero, or whose result is a skip, produce no row. A row ends
+ * with the error string if the result is an error, otherwise with the total
+ * compressed size. Output is flushed after every row.
+ *
+ * @param results The file to write the results table to.
+ * @return Always 0.
+ */
+static int run_all(FILE* results) {
+    tprint_names(results, "Data", "Config", "Method");
+    tprintf(results, "Total compressed size\n");
+    for (size_t method = 0; methods[method] != NULL; ++method) {
+        if (g_method != NULL && strcmp(methods[method]->name, g_method))
+            continue;
+        for (size_t datum = 0; data[datum] != NULL; ++datum) {
+            if (g_data != NULL && strcmp(data[datum]->name, g_data))
+                continue;
+            /*
+             * Create the state common to all configs. It is reused for every
+             * config below and destroyed after the config loop.
+             */
+            method_state_t* state = methods[method]->create(data[datum]);
+            for (size_t config = 0; configs[config] != NULL; ++config) {
+                if (g_config != NULL && strcmp(configs[config]->name, g_config))
+                    continue;
+                if (config_skip_data(configs[config], data[datum]))
+                    continue;
+                /* Print the result for the (method, data, config) tuple. */
+                result_t const result =
+                    methods[method]->compress(state, configs[config]);
+                if (result_is_skip(result))
+                    continue;
+                tprint_names(
+                    results,
+                    data[datum]->name,
+                    configs[config]->name,
+                    methods[method]->name);
+                if (result_is_error(result)) {
+                    tprintf(results, "%s\n", result_get_error_string(result));
+                } else {
+                    tprintf(
+                        results,
+                        "%llu\n",
+                        (unsigned long long)result_get_data(result).total_size);
+                }
+                tflush(results);
+            }
+            methods[method]->destroy(state);
+        }
+    }
+    return 0;
+}
+
 /** memcmp() the old results file and the new results file. */
 static int diff_results(char const* actual_file, char const* expected_file) {
    data_buffer_t const actual = data_buffer_read(actual_file);