LCOV - code coverage report
Current view: top level - lib/compression/tests - test_lzx_huffman.c (source / functions) Hit Total Coverage
Test: coverage report for master 2f515e9b Lines: 474 486 97.5 %
Date: 2024-04-21 15:09:00 Functions: 20 20 100.0 %

          Line data    Source code
       1             : /*
       2             :  * Samba compression library - LGPLv3
       3             :  *
       4             :  * Copyright © Catalyst IT 2022
       5             :  *
       6             :  * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
       7             :  *
       8             :  *  ** NOTE! The following LGPL license applies to this file.
       9             :  *  ** It does NOT imply that all of Samba is released under the LGPL
      10             :  *
      11             :  *  This library is free software; you can redistribute it and/or
      12             :  *  modify it under the terms of the GNU Lesser General Public
      13             :  *  License as published by the Free Software Foundation; either
      14             :  *  version 3 of the License, or (at your option) any later version.
      15             :  *
      16             :  *  This library is distributed in the hope that it will be useful,
      17             :  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
      18             :  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      19             :  *  Lesser General Public License for more details.
      20             :  *
      21             :  *  You should have received a copy of the GNU Lesser General Public
      22             :  *  License along with this library; if not, see <http://www.gnu.org/licenses/>.
      23             :  */
      24             : 
      25             : #include <stdarg.h>
      26             : #include <stddef.h>
      27             : #include <setjmp.h>
      28             : #include <cmocka.h>
      29             : #include <stdbool.h>
      30             : #include <sys/stat.h>
      31             : #include "replace.h"
      32             : #include <talloc.h>
      33             : #include "lzxpress_huffman.h"
      34             : #include "lib/util/stable_sort.h"
      35             : #include "lib/util/data_blob.h"
      36             : 
      37             : /* set LZXHUFF_DEBUG_FILES to true to save round-trip files in /tmp. */
      38             : #define LZXHUFF_DEBUG_FILES false
      39             : 
      40             : /* set LZXHUFF_DEBUG_VERBOSE to true to print more. */
      41             : #define LZXHUFF_DEBUG_VERBOSE false
      42             : 
      43             : 
      44             : #if LZXHUFF_DEBUG_VERBOSE
      45             : #define debug_message(...) print_message(__VA_ARGS__)
      46             : 
      47             : #include <time.h>
      48             : 
      49             : struct timespec start = {0};
      50             : struct timespec end = {0};
      51             : static void debug_start_timer(void)
      52             : {
      53             :         clock_gettime(CLOCK_MONOTONIC, &start);
      54             : }
      55             : 
      56             : static void debug_end_timer(const char *name, size_t len)
      57             : {
      58             :         uint64_t ns;
      59             :         double secs;
      60             :         double rate;
      61             :         clock_gettime(CLOCK_MONOTONIC, &end);
      62             :         ns = end.tv_nsec;
      63             :         ns += end.tv_sec * 1000 * 1000 * 1000;
      64             :         ns -= start.tv_nsec;
      65             :         ns -= start.tv_sec * 1000 * 1000 * 1000;
      66             :         secs = ns / 1e9;
      67             :         rate = len / (secs * 1024 * 1024);
      68             :         debug_message("%s %zu bytes in %.2g: \033[1;35m%.2f\033[0m MB per second\n",
      69             :                       name, len, secs, rate);
      70             : }
      71             : 
      72             : #else
      73             : #define debug_message(...) /* debug_message */
      74             : #define debug_start_timer(...) /* debug_start_timer */
      75             : #define debug_end_timer(...) /* debug_end_timer */
      76             : #endif
      77             : 
      78             : 
      79             : struct lzx_pair {
      80             :         const char *name;
      81             :         DATA_BLOB compressed;
      82             :         DATA_BLOB decompressed;
      83             : };
      84             : 
      85             : struct lzx_file_pair {
      86             :         const char *name;
      87             :         const char *compressed_file;
      88             :         const char *decompressed_file;
      89             : };
      90             : 
      91             : 
      92             : #define DECOMP_DIR "testdata/compression/decompressed"
      93             : #define COMP_DIR "testdata/compression/compressed-huffman"
      94             : #define MORE_COMP_DIR "testdata/compression/compressed-more-huffman"
      95             : 
      96             : 
      97             : #define VARRGH(...) __VA_ARGS__
      98             : 
      99             : #define BLOB_FROM_ARRAY(...)                             \
     100             :         {                                                \
     101             :                 .data = (uint8_t[]){__VA_ARGS__},          \
     102             :                 .length = sizeof((uint8_t[]){__VA_ARGS__}) \
     103             :         }
     104             : 
     105             : #define BLOB_FROM_STRING(s)                                      \
     106             :         {                                                            \
     107             :                 .data = discard_const_p(uint8_t, s),                 \
     108             :                 .length = (sizeof(s) - 1)                    \
     109             :         }
     110             : 
     111             : 
     112             : const char * file_names[] = {
     113             :         "27826-8.txt",
     114             :         "5d049b4cb1bd933f5e8ex19",
     115             :         "638e61e96d54279981c3x5",
     116             :         "64k-minus-one-zeros",
     117             :         "64k-plus-one-zeros",
     118             :         "64k-zeros",
     119             :         "96f696a4e5ce56c61a3dx10",
     120             :         "9e0b6a12febf38e98f13",
     121             :         "abc-times-101",
     122             :         "abc-times-105",
     123             :         "abc-times-200",
     124             :         "and_rand",
     125             :         "and_rand-128k+",
     126             :         "b63289ccc7f218c0d56b",
     127             :         "beta-variate1-128k+",
     128             :         "beta-variate2-128k+",
     129             :         "beta-variate3-128k+",
     130             :         "decayed_alphabet_128k+",
     131             :         "decayed_alphabet_64k",
     132             :         "exp_shuffle",
     133             :         "exp_shuffle-128k+",
     134             :         "f00842317dc6d5695b02",
     135             :         "fib_shuffle",
     136             :         "fib_shuffle-128k+",
     137             :         "fuzzing-0fc2d461b56cd8103c91",
     138             :         "fuzzing-17c961778538cc10ab7c",
     139             :         "fuzzing-3591f9dc02bb00a54b60",
     140             :         "fuzzing-3ec3bca27bb9eb40c128",
     141             :         "fuzzing-80b4fa18ff5f8dd04862",
     142             :         "fuzzing-a3115a81d1ac500318f9",
     143             :         "generate-windows-test-vectors.c",
     144             :         "midsummer-nights-dream.txt",
     145             :         "notes-on-the-underground.txt",
     146             :         "pg22009.txt",
     147             :         "repeating",
     148             :         "repeating-exactly-64k",
     149             :         "setup.log",
     150             :         "skewed_choices",
     151             :         "skewed_choices-128k+",
     152             :         /* These ones were deathly slow in fuzzing at one point */
     153             :         "slow-015ddc36a71412ccc50d",
     154             :         "slow-100e9f966a7feb9ca40a",
     155             :         "slow-2a671c3cff4f1574cbab",
     156             :         "slow-33d90a24e70515b14cd0",
     157             :         "slow-49d8c05261e3f412fc72",
     158             :         "slow-50a249d2fe56873e56a0",
     159             :         "slow-63e9f0b52235fb0129fa",
     160             :         "slow-73b7f971d65908ac0095",
     161             :         "slow-8b61e3dd267908544531",
     162             :         "slow-9d1c5a079b0462986f1f",
     163             :         "slow-aa7262a821dabdcf04a6",
     164             :         "slow-b8a91d142b0d2af7f5ca",
     165             :         "slow-c79142457734bbc8d575",
     166             :         "slow-d736544545b90d83fe75",
     167             :         "slow-e3b9bdfaed7d1a606fdb",
     168             :         "slow-f3f1c02a9d006e5e1703",
     169             :         "square_series",
     170             :         "square_series-128k+",
     171             :         "trigram_128k+",
     172             :         "trigram_64k",
     173             :         "trigram_sum_128k+",
     174             :         "trigram_sum_64k",
     175             :         NULL
     176             : };
     177             : 
     178             : struct lzx_pair bidirectional_pairs[] = {
     179             : 
     180             :         {.name = "abc__100_repeats", /* [MS-XCA] 3.2 example 2. */
     181             :          .decompressed = BLOB_FROM_STRING(
     182             :                  "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
     183             :                  "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
     184             :                  "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
     185             :                  "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
     186             :                  "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
     187             :                  ),
     188             :          .compressed = BLOB_FROM_ARRAY(
     189             :                  /*
     190             :                   * The 'a', 'b', and 'c' bytes are 0x61, 0x62, 0x63. No other
     191             :                   * symbols occur. That means we need 48 0x00 bytes for the
     192             :                   * first 96 symbol-nybbles, then some short codes, then zeros
     193             :                   * again for the rest of the literals.
     194             :                   */
     195             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     196             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     197             :                  0,0,0,0,0, 0,0,0,
     198             :                  0x30, 0x23, /* a_ cb */
     199             :                  0,0,0,0,0, 0,0,0,0,0,
     200             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     201             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 100 bytes */
     202             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     203             :                  0,0,0,0,0, 0,0,0, /* here end the 0-255 literals (128 bytes) */
     204             :                  0x02, /* 'EOF' symbol 256 (byte 128 low) */
     205             :                  0,0,0,0,0, 0,0,0,0,0, 0,                    /* 140 bytes */
     206             :                  0,0,0,
     207             :                  0x20, /* codepoint 287 (byte 143 high) */
     208             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,         /* 160 bytes */
     209             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     210             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     211             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     212             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 bytes */
     213             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
     214             :                  /*
     215             :                   * So that's the tree.
     216             :                   *
     217             :                   * length 2 codes for 'c', EOF, 287
     218             :                   * length 3 for 'a', 'b'.
     219             :                   *
     220             :                   * c:   00
     221             :                   * EOF: 01
     222             :                   * 287: 10
     223             :                   * a:   110
     224             :                   * b:   111
     225             :                   *
     226             :                   * thus the literal string "abc" is 110-111-00.
     227             :                   *
     228             :                   * Now for the lz77 match definitions for EOF and 287.
     229             :                   *
     230             :                   * Why 287? It encodes the match distance and offset.
     231             :                   *
     232             :                   * 287 - 256 = 31
     233             :                   *
     234             :                   * _length = 31 % 16 = 15
     235             :                   * _distance = 31 / 16 = 1
     236             :                   *
     237             :                   * (it's easier to look at the hex, 0x11f:
     238             :                   * 1xx means a match; x1x is _distance; xxf is _length)
     239             :                   *
     240             :                   * _distance 1 means a two bit distance (10 or 11; 2 or 3).
     241             :                   * That means the next bit will be the least significant bit
     242             :                   * of distance (1 in this case, meaning distance 3).
     243             :                   *
     244             :                   * if _length is 15, real length is included inline.
     245             :                   *
     246             :                   * 'EOF' == 256 means _length = 0, _distance = 0.
     247             :                   *
     248             :                   * _distance 0 means 1, so no further bits needed.
     249             :                   * _length 0 means length 3.
     250             :                   *
     251             :                   * but when used as EOF, this doesn't matter.
     252             :                   */
     253             :                  0xa8, 0xdc, 0x00, 0x00, 0xff, 0x26, 0x01
     254             :                   /* These remaining bytes are:
     255             :                   *
     256             :                   * 10101000 11011100 00000000 00000000 11111111
     257             :                   * 00100110 00000001
     258             :                   *
     259             :                   * and we read them as 16 chunks (i.e. flipping odd/even order)
     260             :                   *
     261             :                   * 110-111-00  10-1-01-000
     262             :                   *   a  b   c 287 | EOF
     263             :                   *                |
     264             :                   *                this is the 287 distance low bit.
     265             :                   *
     266             :                   * The last 3 bits are not used. The 287 length is sort of
     267             :                   * out of band, coming up soon (because 287 encodes length
     268             :                   * 15; most codes do not do this).
     269             :                   *
     270             :                   * 00000000 00000000
     271             :                   *
     272             :                   * This padding is there because the first 32 bits are read
     273             :                   * at the beginning of decoding. If there were more things to
     274             :                   * be encoded, they would be in here.
     275             :                   *
     276             :                   * 11111111
     277             :                   *
     278             :                   * This byte is pulled as the length for the 287 match.
     279             :                   * Because it is 0xff, we pull a further 2 bytes for the
     280             :                   * actual length, i.e. a 16 bit number greater than 270.
     281             :                   *
     282             :                   * 00000001 00100110
     283             :                   *
     284             :                   * that is 0x126 = 294 = the match length - 3 (because we're
     285             :                   * encoding ["abc", <copy from 3 back, 297 chars>, EOF]).
     286             :                   *
     287             :                   */
     288             :                  )
     289             :         },
     290             :         {.name = "abcdefghijklmnopqrstuvwxyz", /* [MS-XCA] 3.2 example 1. */
     291             :          .decompressed = BLOB_FROM_STRING("abcdefghijklmnopqrstuvwxyz"),
     292             :          .compressed = BLOB_FROM_ARRAY(
     293             :                  /*
     294             :                   * In this case there are no matches encoded as there are no
     295             :                   * repeated symbols. Including the EOF, there are 27 symbols
     296             :                   * all occurring exactly as frequently as each other (once).
     297             :                   * From that we would expect the codes to be mostly 5 bits
     298             :                   * long, because 27 < 2^5 (32), but greater than 2^4. And
     299             :                   * that's what we see.
     300             :                   */
     301             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     302             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     303             :                  0,0,0,0,0, 0,0,0,
     304             :                  /* 14 non-zero bytes for 26 letters/nibbles */
     305             :                  0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
     306             :                  0x55, 0x55, 0x55, 0x45, 0x44, 0x04,
     307             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,   /* 80 */
     308             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     309             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     310             :                  0,0,0,0,0, 0,0,0,
     311             :                  0x04,                                 /* 0x100 EOF */
     312             :                  /* no matches */
     313             :                  0,0,0,0,0, 0,0,0,0,0, 0,                    /* 140 */
     314             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     315             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     316             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     317             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
     318             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 */
     319             :                  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
     320             : 
     321             :                  0xd8, 0x52, 0x3e, 0xd7, 0x94, 0x11, 0x5b, 0xe9,
     322             :                  0x19, 0x5f, 0xf9, 0xd6, 0x7c, 0xdf, 0x8d, 0x04,
     323             :                  0x00, 0x00, 0x00, 0x00)
     324             :         },
     325             :         {0}
     326             : };
     327             : 
     328             : 
     329           1 : static void test_lzxpress_huffman_decompress(void **state)
     330             : {
     331           1 :         size_t i;
     332           1 :         ssize_t written;
     333           1 :         uint8_t *dest = NULL;
     334           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     335           4 :         for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
     336           2 :                 struct lzx_pair p = bidirectional_pairs[i];
     337           2 :                 dest = talloc_array(mem_ctx, uint8_t, p.decompressed.length);
     338             : 
     339             :                 debug_message("%s compressed %zu decomp %zu\n", p.name,
     340             :                               p.compressed.length,
     341           2 :                               p.decompressed.length);
     342             : 
     343           2 :                 written = lzxpress_huffman_decompress(p.compressed.data,
     344             :                                                       p.compressed.length,
     345             :                                                       dest,
     346             :                                                       p.decompressed.length);
     347           2 :                 assert_int_not_equal(written, -1);
     348           2 :                 assert_int_equal(written, p.decompressed.length);
     349             : 
     350           2 :                 assert_memory_equal(dest, p.decompressed.data, p.decompressed.length);
     351           2 :                 talloc_free(dest);
     352             :         }
     353           1 : }
     354             : 
     355           1 : static void test_lzxpress_huffman_compress(void **state)
     356             : {
     357           1 :         size_t i;
     358           1 :         ssize_t written;
     359           1 :         uint8_t *dest = NULL;
     360           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     361           4 :         for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
     362           2 :                 struct lzx_pair p = bidirectional_pairs[i];
     363             :                 debug_message("%s compressed %zu decomp %zu\n", p.name,
     364             :                               p.compressed.length,
     365           2 :                               p.decompressed.length);
     366             : 
     367           2 :                 written = lzxpress_huffman_compress_talloc(mem_ctx,
     368             :                                                            p.decompressed.data,
     369             :                                                            p.decompressed.length,
     370             :                                                            &dest);
     371             : 
     372           2 :                 assert_int_not_equal(written, -1);
     373           2 :                 assert_int_equal(written, p.compressed.length);
     374           2 :                 assert_memory_equal(dest, p.compressed.data, p.compressed.length);
     375           2 :                 talloc_free(dest);
     376             :         }
     377           1 : }
     378             : 
     379             : 
     380         367 : static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
     381             :                                     const char *filename)
     382             : {
     383         367 :         DATA_BLOB b = {0};
     384         367 :         FILE *fh = fopen(filename, "rb");
     385         367 :         int ret;
     386         367 :         struct stat s;
     387         367 :         size_t len;
     388         367 :         if (fh == NULL) {
     389          22 :                 debug_message("could not open '%s'\n", filename);
     390          22 :                 return b;
     391             :         }
     392         345 :         ret = fstat(fileno(fh), &s);
     393         345 :         if (ret != 0) {
     394           0 :                 fclose(fh);
     395           0 :                 return b;
     396             :         }
     397         345 :         b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
     398         345 :         if (b.data == NULL) {
     399           0 :                 fclose(fh);
     400           0 :                 return b;
     401             :         }
     402         345 :         len = fread(b.data, 1, s.st_size, fh);
     403         345 :         if (ferror(fh) || len != s.st_size) {
     404           0 :                 TALLOC_FREE(b.data);
     405             :         } else {
     406             :                 b.length = len;
     407             :         }
     408         345 :         fclose(fh);
     409         345 :         return b;
     410             : }
     411             : 
     412             : 
     413             : 
     414           1 : static void test_lzxpress_huffman_decompress_files(void **state)
     415             : {
     416           1 :         size_t i;
     417           1 :         int score = 0;
     418           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     419          63 :         for (i = 0; file_names[i] != NULL; i++) {
     420          61 :                 char filename[200];
     421          61 :                 uint8_t *dest = NULL;
     422          61 :                 ssize_t written;
     423          61 :                 TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
     424          61 :                 struct lzx_pair p = {
     425          61 :                         .name = file_names[i]
     426             :                 };
     427             : 
     428          61 :                 debug_message("%s\n", p.name);
     429             : 
     430          61 :                 snprintf(filename, sizeof(filename),
     431             :                          "%s/%s.decomp", DECOMP_DIR, p.name);
     432             : 
     433          61 :                 p.decompressed = datablob_from_file(tmp_ctx, filename);
     434          61 :                 assert_non_null(p.decompressed.data);
     435             : 
     436          61 :                 snprintf(filename, sizeof(filename),
     437             :                          "%s/%s.lzhuff", COMP_DIR, p.name);
     438             : 
     439          61 :                 p.compressed = datablob_from_file(tmp_ctx, filename);
     440          61 :                 assert_non_null(p.compressed.data);
     441             : 
     442          61 :                 dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
     443          61 :                 debug_start_timer();
     444          61 :                 written = lzxpress_huffman_decompress(p.compressed.data,
     445             :                                                       p.compressed.length,
     446             :                                                       dest,
     447             :                                                       p.decompressed.length);
     448          61 :                 debug_end_timer("decompress", p.decompressed.length);
     449          61 :                 if (written != -1 &&
     450          61 :                     written == p.decompressed.length &&
     451          61 :                     memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
     452          61 :                         debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
     453          61 :                         score++;
     454             :                 } else {
     455             :                         debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
     456          61 :                                       p.name);
     457             :                         debug_message("size %zd vs reference %zu\n",
     458          61 :                                       written, p.decompressed.length);
     459             :                 }
     460          61 :                 talloc_free(tmp_ctx);
     461             :         }
     462           1 :         debug_message("%d/%zu correct\n", score, i);
     463           1 :         assert_int_equal(score, i);
     464           1 : }
     465             : 
     466             : 
     467           1 : static void test_lzxpress_huffman_decompress_more_compressed_files(void **state)
     468             : {
     469             :         /*
     470             :          * This tests the decompression of files that have been compressed on
     471             :          * Windows with the level turned up (to 1, default for MS-XCA is 0).
     472             :          *
     473             :          * The format is identical, but it will have tried harder to find
     474             :          * matches.
     475             :          */
     476           1 :         size_t i;
     477           1 :         int score = 0;
     478           1 :         int found = 0;
     479           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     480          63 :         for (i = 0; file_names[i] != NULL; i++) {
     481          61 :                 char filename[200];
     482          61 :                 uint8_t *dest = NULL;
     483          61 :                 ssize_t written;
     484          61 :                 TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
     485          61 :                 struct lzx_pair p = {
     486          61 :                         .name = file_names[i]
     487             :                 };
     488             : 
     489          61 :                 debug_message("%s\n", p.name);
     490             : 
     491          61 :                 snprintf(filename, sizeof(filename),
     492             :                          "%s/%s.decomp", DECOMP_DIR, p.name);
     493             : 
     494          61 :                 p.decompressed = datablob_from_file(tmp_ctx, filename);
     495          61 :                 assert_non_null(p.decompressed.data);
     496             : 
     497          61 :                 snprintf(filename, sizeof(filename),
     498             :                          "%s/%s.lzhuff", MORE_COMP_DIR, p.name);
     499             : 
     500          61 :                 p.compressed = datablob_from_file(tmp_ctx, filename);
     501          61 :                 if (p.compressed.data == NULL) {
     502             :                         /*
     503             :                          * We don't have all the vectors in the
     504             :                          * more-compressed directory, which is OK, we skip
     505             :                          * them.
     506             :                          */
     507          22 :                         continue;
     508             :                 }
     509          39 :                 found++;
     510          39 :                 dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
     511          39 :                 debug_start_timer();
     512          39 :                 written = lzxpress_huffman_decompress(p.compressed.data,
     513             :                                                       p.compressed.length,
     514             :                                                       dest,
     515             :                                                       p.decompressed.length);
     516          39 :                 debug_end_timer("decompress", p.decompressed.length);
     517          39 :                 if (written == p.decompressed.length &&
     518          39 :                     memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
     519          39 :                         debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
     520          39 :                         score++;
     521             :                 } else {
     522             :                         debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
     523          39 :                                       p.name);
     524             :                         debug_message("size %zd vs reference %zu\n",
     525          39 :                                       written, p.decompressed.length);
     526             :                 }
     527          39 :                 talloc_free(tmp_ctx);
     528             :         }
     529           1 :         debug_message("%d/%d correct\n", score, found);
     530           1 :         assert_int_equal(score, found);
     531           1 : }
     532             : 
     533             : 
     534             : /*
     535             :  * attempt_round_trip() tests whether a data blob can survive a compression
     536             :  * and decompression cycle. If save_name is not NULL and LZXHUFF_DEBUG_FILES
     537             :  * evals to true, the various stages are saved in files with that name and the
     538             :  * '-original', '-compressed', and '-decompressed' suffixes. If ref_compressed
     539             :  * has data, it'll print a message saying whether the compressed data matches
     540             :  * that.
     541             :  */
     542             : 
     543         299 : static ssize_t attempt_round_trip(TALLOC_CTX *mem_ctx,
     544             :                                   DATA_BLOB original,
     545             :                                   const char *save_name,
     546             :                                   DATA_BLOB ref_compressed)
     547             : {
     548         299 :         TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
     549         299 :         DATA_BLOB compressed = data_blob_talloc(tmp_ctx, NULL,
     550             :                                                 original.length * 4 / 3 + 260);
     551         299 :         DATA_BLOB decompressed = data_blob_talloc(tmp_ctx, NULL,
     552             :                                                 original.length);
     553         299 :         ssize_t comp_written, decomp_written;
     554         299 :         debug_start_timer();
     555         299 :         comp_written = lzxpress_huffman_compress_talloc(tmp_ctx,
     556             :                                                         original.data,
     557             :                                                         original.length,
     558             :                                                         &compressed.data);
     559         299 :         debug_end_timer("compress", original.length);
     560         299 :         if (comp_written <= 0) {
     561           0 :                 talloc_free(tmp_ctx);
     562           0 :                 return -1;
     563             :         }
     564             : 
     565         299 :         if (ref_compressed.data != NULL) {
     566             :                 /*
     567             :                  * This is informational, not an assertion; there are
     568             :                  * ~infinite legitimate ways to compress the data, many as
     569             :                  * good as each other (think of compression as a language, not
     570             :                  * a format).
     571             :                  */
     572             :                 debug_message("compressed size %zd vs reference %zu\n",
     573             :                               comp_written, ref_compressed.length);
     574             : 
     575             :                 if (comp_written == compressed.length &&
     576             :                     memcmp(compressed.data, ref_compressed.data, comp_written) == 0) {
     577         299 :                         debug_message("\033[1;32mbyte identical!\033[0m\n");
     578             :                 }
     579             :         }
     580         299 :         debug_start_timer();
     581         299 :         decomp_written = lzxpress_huffman_decompress(compressed.data,
     582             :                                                      comp_written,
     583             :                                                      decompressed.data,
     584             :                                                      original.length);
     585         299 :         debug_end_timer("decompress", original.length);
     586         299 :         if (save_name != NULL && LZXHUFF_DEBUG_FILES) {
     587             :                 char s[300];
     588             :                 FILE *fh = NULL;
     589             : 
     590             :                 snprintf(s, sizeof(s), "%s-original", save_name);
     591             :                 fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
     592             :                 fh = fopen(s, "w");
     593             :                 fwrite(original.data, 1, original.length, fh);
     594             :                 fclose(fh);
     595             : 
     596             :                 snprintf(s, sizeof(s), "%s-compressed", save_name);
     597             :                 fprintf(stderr, "Saving %zu bytes to %s\n", comp_written, s);
     598             :                 fh = fopen(s, "w");
     599             :                 fwrite(compressed.data, 1, comp_written, fh);
     600             :                 fclose(fh);
     601             :                 /*
     602             :                  * We save the decompressed file using original.length, not
     603             :                  * the returned size. If these differ, the returned size will
     604             :                  * be -1. By saving the whole buffer we can see at what point
     605             :                  * it went haywire.
     606             :                  */
     607             :                 snprintf(s, sizeof(s), "%s-decompressed", save_name);
     608             :                 fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
     609             :                 fh = fopen(s, "w");
     610             :                 fwrite(decompressed.data, 1, original.length, fh);
     611             :                 fclose(fh);
     612             :         }
     613             : 
     614         299 :         if (original.length != decomp_written ||
     615         299 :             memcmp(decompressed.data,
     616             :                    original.data,
     617             :                    original.length) != 0) {
     618             :                 debug_message("\033[1;31mgot %zd, expected %zu\033[0m\n",
     619             :                               decomp_written,
     620           0 :                               original.length);
     621           0 :                 talloc_free(tmp_ctx);
     622           0 :                 return -1;
     623             :         }
     624         299 :         talloc_free(tmp_ctx);
     625         299 :         return comp_written;
     626             : }
     627             : 
     628             : 
     629           1 : static void test_lzxpress_huffman_round_trip(void **state)
     630             : {
     631           1 :         size_t i;
     632           1 :         int score = 0;
     633           1 :         ssize_t compressed_total = 0;
     634           1 :         ssize_t reference_total = 0;
     635           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     636          63 :         for (i = 0; file_names[i] != NULL; i++) {
     637          61 :                 char filename[200];
     638          61 :                 char *debug_files = NULL;
     639          61 :                 TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
     640          61 :                 ssize_t comp_size;
     641          61 :                 struct lzx_pair p = {
     642          61 :                         .name = file_names[i]
     643             :                 };
     644          61 :                 debug_message("-------------------\n");
     645          61 :                 debug_message("%s\n", p.name);
     646             : 
     647          61 :                 snprintf(filename, sizeof(filename),
     648             :                          "%s/%s.decomp", DECOMP_DIR, p.name);
     649             : 
     650          61 :                 p.decompressed = datablob_from_file(tmp_ctx, filename);
     651          61 :                 assert_non_null(p.decompressed.data);
     652             : 
     653          61 :                 snprintf(filename, sizeof(filename),
     654             :                          "%s/%s.lzhuff", COMP_DIR, p.name);
     655             : 
     656          61 :                 p.compressed = datablob_from_file(tmp_ctx, filename);
     657          61 :                 if (p.compressed.data == NULL) {
     658             :                         debug_message(
     659             :                                 "Could not load %s reference file %s\n",
     660             :                                 p.name, filename);
     661             :                         debug_message("%s decompressed %zu\n", p.name,
     662             :                                       p.decompressed.length);
     663             :                 } else {
     664             :                         debug_message("%s: reference compressed %zu decomp %zu\n",
     665             :                                       p.name,
     666             :                                       p.compressed.length,
     667          61 :                                       p.decompressed.length);
     668             :                 }
     669          61 :                 if (1) {
     670             :                         /*
     671             :                          * We're going to save copies in /tmp.
     672             :                          */
     673          61 :                         snprintf(filename, sizeof(filename),
     674             :                                  "/tmp/lzxhuffman-%s", p.name);
     675          61 :                         debug_files = filename;
     676             :                 }
     677             : 
     678          61 :                 comp_size = attempt_round_trip(mem_ctx, p.decompressed,
     679             :                                                debug_files,
     680             :                                                p.compressed);
     681          61 :                 if (comp_size > 0) {
     682          61 :                         debug_message("\033[1;32mround trip!\033[0m\n");
     683          61 :                         score++;
     684          61 :                         if (p.compressed.length) {
     685          61 :                                 compressed_total += comp_size;
     686          61 :                                 reference_total += p.compressed.length;
     687             :                         }
     688             :                 }
     689          61 :                 talloc_free(tmp_ctx);
     690             :         }
     691           1 :         debug_message("%d/%zu correct\n", score, i);
     692           1 :         print_message("\033[1;34mtotal compressed size: %zu\033[0m\n",
     693             :                       compressed_total);
     694           1 :         print_message("total reference size:  %zd \n", reference_total);
     695           1 :         print_message("diff:                  %7zd \n",
     696             :                       reference_total - compressed_total);
     697           1 :         assert_true(reference_total != 0);
     698           1 :         print_message("ratio: \033[1;3%dm%.2f\033[0m \n",
     699             :                       2 + (compressed_total >= reference_total),
     700           1 :                       ((double)compressed_total) / reference_total);
     701             :         /*
     702             :          * Assert that the compression is *about* as good as Windows. Of course
     703             :          * it doesn't matter if we do better, but mysteriously getting better
     704             :          * is usually a sign that something is wrong.
     705             :          *
     706             :          * At the time of writing, compressed_total is 2674004, or 10686 more
     707             :          * than the Windows reference total. That's < 0.5% difference, we're
     708             :          * asserting at 2%.
     709             :          */
     710           1 :         assert_true(labs(compressed_total - reference_total) <
     711             :                     compressed_total / 50);
     712             : 
     713           1 :         assert_int_equal(score, i);
     714           1 :         talloc_free(mem_ctx);
     715           1 : }
     716             : 
     717             : /*
     718             :  * Bob Jenkins' Small Fast RNG.
     719             :  *
     720             :  * We don't need it to be this good, but we do need it to be reproduceable
     721             :  * across platforms, which rand() etc aren't.
     722             :  *
     723             :  * http://burtleburtle.net/bob/rand/smallprng.html
     724             :  */
     725             : 
     726             : struct jsf_rng {
     727             :         uint32_t a;
     728             :         uint32_t b;
     729             :         uint32_t c;
     730             :         uint32_t d;
     731             : };
     732             : 
     733             : #define ROTATE32(x, k) (((x) << (k)) | ((x) >> (32 - (k))))
     734             : 
     735    26542191 : static uint32_t jsf32(struct jsf_rng *x) {
     736    26542191 :         uint32_t e = x->a - ROTATE32(x->b, 27);
     737    26542191 :         x->a = x->b ^ ROTATE32(x->c, 17);
     738    26542191 :         x->b = x->c + x->d;
     739    26542191 :         x->c = x->d + e;
     740    26542191 :         x->d = e + x->a;
     741    26542191 :         return x->d;
     742             : }
     743             : 
     744           5 : static void jsf32_init(struct jsf_rng *x, uint32_t seed) {
     745           5 :         size_t i;
     746           5 :         x->a = 0xf1ea5eed;
     747           5 :         x->b = x->c = x->d = seed;
     748         126 :         for (i = 0; i < 20; ++i) {
     749         120 :                 jsf32(x);
     750             :         }
     751             : }
     752             : 
     753             : 
     754           1 : static void test_lzxpress_huffman_long_gpl_round_trip(void **state)
     755             : {
     756             :         /*
     757             :          * We use a kind of model-free Markov model to generate a massively
     758             :          * extended pastiche of the GPLv3 (chosen because it is right there in
     759             :          * "COPYING" and won't change often).
     760             :          *
     761             :          * The point is to check a round trip of a very long message with
     762             :          * multiple repetitions on many scales, without having to add a very
     763             :          * large file.
     764             :          */
     765           1 :         size_t i, j, k;
     766           1 :         uint8_t c;
     767           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     768           1 :         DATA_BLOB gpl = datablob_from_file(mem_ctx, "COPYING");
     769           1 :         DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
     770           1 :         DATA_BLOB ref = {0};
     771           1 :         ssize_t comp_size;
     772           1 :         struct jsf_rng rng;
     773             : 
     774           1 :         if (gpl.data == NULL) {
     775           0 :                 print_message("could not read COPYING\n");
     776           0 :                 fail();
     777             :         }
     778             : 
     779           1 :         jsf32_init(&rng, 1);
     780             : 
     781           1 :         j = 1;
     782           1 :         original.data[0] = gpl.data[0];
     783     5242880 :         for (i = 1; i < original.length; i++) {
     784     5242879 :                 size_t m;
     785     5242879 :                 char p = original.data[i - 1];
     786     5242879 :                 c = gpl.data[j];
     787     5242879 :                 original.data[i] = c;
     788     5242879 :                 j++;
     789     5242879 :                 m = (j + jsf32(&rng)) % (gpl.length - 50);
     790   147802506 :                 for (k = m; k < m + 30; k++) {
     791   143460910 :                         if (p == gpl.data[k] &&
     792     9104961 :                             c == gpl.data[k + 1]) {
     793      901283 :                                 j = k + 2;
     794      901283 :                                 break;
     795             :                         }
     796             :                 }
     797     5242879 :                 if (j == gpl.length) {
     798          62 :                         j = 1;
     799             :                 }
     800             :         }
     801             : 
     802           1 :         comp_size = attempt_round_trip(mem_ctx, original, "/tmp/gpl", ref);
     803           1 :         assert_true(comp_size > 0);
     804           1 :         assert_true(comp_size < original.length);
     805             : 
     806           1 :         talloc_free(mem_ctx);
     807           1 : }
     808             : 
     809             : 
     810           2 : static void test_lzxpress_huffman_long_random_graph_round_trip(void **state)
     811             : {
     812           2 :         size_t i;
     813           2 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     814           2 :         DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
     815           2 :         DATA_BLOB ref = {0};
     816             :         /*
     817             :          * There's a random trigram graph, with each pair of sequential bytes
     818             :          * pointing to a successor. This would probably fall into a fairly
     819             :          * simple loop, but we introduce damage into the system, randomly
     820             :          * flipping about 1 bit in 64.
     821             :          *
     822             :          * The result is semi-structured and compressible.
     823             :          */
     824           2 :         uint8_t *d = original.data;
     825           2 :         uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
     826           2 :         uint32_t *table32 = (void*)table;
     827           2 :         ssize_t comp_size;
     828           2 :         struct jsf_rng rng;
     829             : 
     830           2 :         jsf32_init(&rng, 1);
     831       32770 :         for (i = 0; i < (65536 / 4); i++) {
     832       32768 :                 table32[i] = jsf32(&rng);
     833             :         }
     834             : 
     835           2 :         d[0] = 'a';
     836           2 :         d[1] = 'b';
     837             : 
     838    10485758 :         for (i = 2; i < original.length; i++) {
     839    10485756 :                 uint16_t k = (d[i - 2] << 8) | d[i - 1];
     840    10485756 :                 uint32_t damage = jsf32(&rng) & jsf32(&rng) & jsf32(&rng);
     841    10485756 :                 damage &= (damage >> 16);
     842    10485756 :                 k ^= damage & 0xffff;
     843    10485756 :                 d[i] = table[k];
     844             :         }
     845             : 
     846           2 :         comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-graph", ref);
     847           2 :         assert_true(comp_size > 0);
     848           2 :         assert_true(comp_size < original.length);
     849             : 
     850           2 :         talloc_free(mem_ctx);
     851           2 : }
     852             : 
     853             : 
     854           1 : static void test_lzxpress_huffman_chaos_graph_round_trip(void **state)
     855             : {
     856           1 :         size_t i;
     857           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     858           1 :         DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
     859           1 :         DATA_BLOB ref = {0};
     860             :         /*
     861             :          * There's a random trigram graph, with each pair of sequential bytes
     862             :          * pointing to a successor. This would probably fall into a fairly
     863             :          * simple loop, but we keep changing the graph. The result is long
     864             :          * periods of stability separatd by bursts of noise.
     865             :          */
     866           1 :         uint8_t *d = original.data;
     867           1 :         uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
     868           1 :         uint32_t *table32 = (void*)table;
     869           1 :         ssize_t comp_size;
     870           1 :         struct jsf_rng rng;
     871             : 
     872           1 :         jsf32_init(&rng, 1);
     873       16385 :         for (i = 0; i < (65536 / 4); i++) {
     874       16384 :                 table32[i] = jsf32(&rng);
     875             :         }
     876             : 
     877           1 :         d[0] = 'a';
     878           1 :         d[1] = 'b';
     879             : 
     880     5242879 :         for (i = 2; i < original.length; i++) {
     881     5242878 :                 uint16_t k = (d[i - 2] << 8) | d[i - 1];
     882     5242878 :                 uint32_t damage = jsf32(&rng);
     883     5242878 :                 d[i] = table[k];
     884     5242878 :                 if ((damage >> 29) == 0) {
     885      655652 :                         uint16_t index = damage & 0xffff;
     886      655652 :                         uint8_t value = (damage >> 16) & 0xff;
     887      655652 :                         table[index] = value;
     888             :                 }
     889             :         }
     890             : 
     891           1 :         comp_size = attempt_round_trip(mem_ctx, original, "/tmp/chaos-graph", ref);
     892           1 :         assert_true(comp_size > 0);
     893           1 :         assert_true(comp_size < original.length);
     894             : 
     895           1 :         talloc_free(mem_ctx);
     896           1 : }
     897             : 
     898             : 
     899           1 : static void test_lzxpress_huffman_sparse_random_graph_round_trip(void **state)
     900             : {
     901           1 :         size_t i;
     902           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     903           1 :         DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
     904           1 :         DATA_BLOB ref = {0};
     905             :         /*
     906             :          * There's a random trigram graph, with each pair of sequential bytes
     907             :          * pointing to a successor. This will fall into a fairly simple loops,
     908             :          * but we introduce damage into the system, randomly mangling about 1
     909             :          * byte in 65536.
     910             :          *
     911             :          * The result has very long repetitive runs, which should lead to
     912             :          * oversized blocks.
     913             :          */
     914           1 :         uint8_t *d = original.data;
     915           1 :         uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
     916           1 :         uint32_t *table32 = (void*)table;
     917           1 :         ssize_t comp_size;
     918           1 :         struct jsf_rng rng;
     919             : 
     920           1 :         jsf32_init(&rng, 3);
     921       16385 :         for (i = 0; i < (65536 / 4); i++) {
     922       16384 :                 table32[i] = jsf32(&rng);
     923             :         }
     924             : 
     925           1 :         d[0] = 'a';
     926           1 :         d[1] = 'b';
     927             : 
     928     5242879 :         for (i = 2; i < original.length; i++) {
     929     5242878 :                 uint16_t k = (d[i - 2] << 8) | d[i - 1];
     930     5242878 :                 uint32_t damage = jsf32(&rng);
     931     5242878 :                 if ((damage & 0xffff0000) == 0) {
     932          77 :                         k ^= damage & 0xffff;
     933             :                 }
     934     5242878 :                 d[i] = table[k];
     935             :         }
     936             : 
     937           1 :         comp_size = attempt_round_trip(mem_ctx, original, "/tmp/sparse-random-graph", ref);
     938           1 :         assert_true(comp_size > 0);
     939           1 :         assert_true(comp_size < original.length);
     940             : 
     941           1 :         talloc_free(mem_ctx);
     942           1 : }
     943             : 
     944             : 
     945           1 : static void test_lzxpress_huffman_random_noise_round_trip(void **state)
     946             : {
     947           1 :         size_t i;
     948           1 :         size_t len = 1024 * 1024;
     949           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     950           1 :         DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
     951           1 :         DATA_BLOB ref = {0};
     952           1 :         ssize_t comp_size;
     953             :         /*
     954             :          * We are filling this up with incompressible noise, but we can assert
     955             :          * quite tight bounds on how badly it will fail to compress.
     956             :          *
     957             :          * Specifically, with randomly distributed codes, the Huffman table
     958             :          * should come out as roughly even, averaging 8 bit codes. Then there
     959             :          * will be a 256 byte table every 64k, which is a 1/256 overhead (i.e.
     960             :          * the compressed length will be 257/256 the original *on average*).
     961             :          * We assert it is less than 1 in 200 but more than 1 in 300.
     962             :          */
     963           1 :         uint32_t *d32 = (uint32_t*)((void*)original.data);
     964           1 :         struct jsf_rng rng;
     965           1 :         jsf32_init(&rng, 2);
     966             : 
     967      262145 :         for (i = 0; i < (len / 4); i++) {
     968      262144 :                 d32[i] = jsf32(&rng);
     969             :         }
     970             : 
     971           1 :         comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-noise", ref);
     972           1 :         assert_true(comp_size > 0);
     973           1 :         assert_true(comp_size > original.length + original.length / 300);
     974           1 :         assert_true(comp_size < original.length + original.length / 200);
     975             :         debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
     976           1 :                       len, comp_size, ((double)comp_size) / len);
     977             : 
     978           1 :         talloc_free(mem_ctx);
     979           1 : }
     980             : 
     981             : 
     982           1 : static void test_lzxpress_huffman_overlong_matches(void **state)
     983             : {
     984           1 :         size_t i, j = 0;
     985           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
     986           1 :         DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 1024 * 1024);
     987           1 :         DATA_BLOB ref = {0};
     988           1 :         uint8_t *d = original.data;
     989           1 :         char filename[300];
     990             :         /*
     991             :          * We are testing with something like "aaaaaaaaaaaaaaaaaaaaaaabbbbb"
     992             :          * where typically the number of "a"s is > 65536, and the number of
     993             :          * "b"s is < 42.
     994             :          */
     995           1 :         ssize_t na[] = {65535, 65536, 65537, 65559, 65575, 200000, -1};
     996           1 :         ssize_t nb[] = {1, 2, 20, 39, 40, 41, 42, -1};
     997           1 :         int score = 0;
     998           1 :         ssize_t comp_size;
     999             : 
    1000           7 :         for (i = 0; na[i] >= 0; i++) {
    1001           6 :                 ssize_t a = na[i];
    1002           6 :                 memset(d, 'a', a);
    1003          48 :                 for (j = 0; nb[j] >= 0; j++) {
    1004          42 :                         ssize_t b = nb[j];
    1005          42 :                         memset(d + a, 'b', b);
    1006          42 :                         original.length = a + b;
    1007          42 :                         snprintf(filename, sizeof(filename),
    1008             :                                  "/tmp/overlong-%zd-%zd", a, b);
    1009          42 :                         comp_size = attempt_round_trip(mem_ctx,
    1010             :                                                        original,
    1011             :                                                        filename, ref);
    1012          42 :                         if (comp_size > 0) {
    1013          42 :                                 score++;
    1014             :                         }
    1015             :                 }
    1016             :         }
    1017           1 :         debug_message("%d/%zu correct\n", score, i * j);
    1018           1 :         assert_int_equal(score, i * j);
    1019           1 :         talloc_free(mem_ctx);
    1020           1 : }
    1021             : 
    1022             : 
    1023           1 : static void test_lzxpress_huffman_overlong_matches_abc(void **state)
    1024             : {
    1025           1 :         size_t i, j = 0, k = 0;
    1026           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
    1027           1 :         DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 1024 * 1024);
    1028           1 :         DATA_BLOB ref = {0};
    1029           1 :         uint8_t *d = original.data;
    1030           1 :         char filename[300];
    1031             :         /*
    1032             :          * We are testing with something like "aaaabbbbcc" where typically
    1033             :          * the number of "a"s + "b"s is around 65536, and the number of "c"s
    1034             :          * is < 43.
    1035             :          */
    1036           1 :         ssize_t nab[] = {1, 21, 32767, 32768, 32769, -1};
    1037           1 :         ssize_t nc[] = {1, 2, 20, 39, 40, 41, 42, -1};
    1038           1 :         int score = 0;
    1039           1 :         ssize_t comp_size;
    1040             : 
    1041           6 :         for (i = 0; nab[i] >= 0; i++) {
    1042           5 :                 ssize_t a = nab[i];
    1043           5 :                 memset(d, 'a', a);
    1044          30 :                 for (j = 0; nab[j] >= 0; j++) {
    1045          25 :                         ssize_t b = nab[j];
    1046          25 :                         memset(d + a, 'b', b);
    1047         200 :                         for (k = 0; nc[k] >= 0; k++) {
    1048         175 :                                 ssize_t c = nc[k];
    1049         175 :                                 memset(d + a + b, 'c', c);
    1050         175 :                                 original.length = a + b + c;
    1051         175 :                                 snprintf(filename, sizeof(filename),
    1052             :                                          "/tmp/overlong-abc-%zd-%zd-%zd",
    1053             :                                          a, b, c);
    1054         175 :                                 comp_size = attempt_round_trip(mem_ctx,
    1055             :                                                                original,
    1056             :                                                                filename, ref);
    1057         175 :                                 if (comp_size > 0) {
    1058         175 :                                         score++;
    1059             :                                 }
    1060             :                         }
    1061             :                 }
    1062             :         }
    1063           1 :         debug_message("%d/%zu correct\n", score, i * j * k);
    1064           1 :         assert_int_equal(score, i * j * k);
    1065           1 :         talloc_free(mem_ctx);
    1066           1 : }
    1067             : 
    1068             : 
    1069           1 : static void test_lzxpress_huffman_extremely_compressible_middle(void **state)
    1070             : {
    1071           1 :         size_t len = 192 * 1024;
    1072           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
    1073           1 :         DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
    1074           1 :         DATA_BLOB ref = {0};
    1075           1 :         ssize_t comp_size;
    1076             :         /*
    1077             :          * When a middle block (i.e. not the first and not the last of >= 3),
    1078             :          * can be entirely expressed as a match starting in the previous
    1079             :          * block, the Huffman tree would end up with 1 element, which does not
    1080             :          * work for the code construction. It really wants to use both bits.
    1081             :          * So we need to ensure we have some way of dealing with this.
    1082             :          */
    1083           1 :         memset(original.data, 'a', 0x10000 - 1);
    1084           1 :         memset(original.data + 0x10000 - 1, 'b', 0x10000 + 1);
    1085           1 :         memset(original.data + 0x20000, 'a', 0x10000);
    1086           1 :         comp_size = attempt_round_trip(mem_ctx, original, "/tmp/compressible-middle", ref);
    1087           1 :         assert_true(comp_size > 0);
    1088           1 :         assert_true(comp_size < 1024);
    1089             :         debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
    1090           1 :                       len, comp_size, ((double)comp_size) / len);
    1091             : 
    1092           1 :         talloc_free(mem_ctx);
    1093           1 : }
    1094             : 
    1095             : 
    1096           1 : static void test_lzxpress_huffman_max_length_limit(void **state)
    1097             : {
    1098           1 :         size_t len = 65 * 1024 * 1024;
    1099           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
    1100           1 :         DATA_BLOB original = data_blob_talloc_zero(mem_ctx, len);
    1101           1 :         DATA_BLOB ref = {0};
    1102           1 :         ssize_t comp_size;
    1103             :         /*
    1104             :          * Reputedly Windows has a 64MB limit in the maximum match length it
    1105             :          * will encode. We follow this, and test that here with nearly 65 MB
    1106             :          * of zeros between two letters; this should be encoded in three
    1107             :          * blocks:
    1108             :          *
    1109             :          * 1. 'a', 64M × '\0'
    1110             :          * 2. (1M - 2) × '\0' -- finishing off what would have been the same match
    1111             :          * 3. 'b' EOF
    1112             :          *
    1113             :          * Which we can assert by saying the length is > 768, < 1024.
    1114             :          */
    1115           1 :         original.data[0] = 'a';
    1116           1 :         original.data[len - 1] = 'b';
    1117           1 :         comp_size = attempt_round_trip(mem_ctx, original, "/tmp/max-length-limit", ref);
    1118           1 :         assert_true(comp_size > 0x300);
    1119           1 :         assert_true(comp_size < 0x400);
    1120             :         debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
    1121           1 :                       len, comp_size, ((double)comp_size) / len);
    1122             : 
    1123           1 :         talloc_free(mem_ctx);
    1124           1 : }
    1125             : 
    1126             : 
    1127           1 : static void test_lzxpress_huffman_short_boring_strings(void **state)
    1128             : {
    1129           1 :         size_t len = 64 * 1024;
    1130           1 :         TALLOC_CTX *mem_ctx = talloc_new(NULL);
    1131           1 :         DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
    1132           1 :         DATA_BLOB ref = {0};
    1133           1 :         ssize_t comp_size;
    1134           1 :         ssize_t lengths[] = {
    1135             :                 1, 2, 20, 39, 40, 41, 42, 256, 270, 273, 274, 1000, 64000, -1};
    1136           1 :         char filename[300];
    1137           1 :         size_t i;
    1138             :         /*
    1139             :          * How do short repetitive strings work? We're poking at the limit
    1140             :          * around which LZ77 comprssion is turned on.
    1141             :          *
    1142             :          * For this test we don't change the blob memory between runs, just
    1143             :          * the declared length.
    1144             :          */
    1145           1 :         memset(original.data, 'a', len);
    1146          14 :         for (i = 0; lengths[i] >= 0; i++) {
    1147          13 :                 original.length = lengths[i];
    1148          13 :                 snprintf(filename, sizeof(filename),
    1149             :                          "/tmp/short-boring-%zu",
    1150             :                          original.length);
    1151          13 :                 comp_size = attempt_round_trip(mem_ctx, original, filename, ref);
    1152          13 :                 if (original.length < 41) {
    1153           5 :                         assert_true(comp_size > 256 + original.length / 8);
    1154           8 :                 } else if (original.length < 274) {
    1155           5 :                         assert_true(comp_size == 261);
    1156             :                 } else {
    1157           3 :                         assert_true(comp_size == 263);
    1158             :                 }
    1159          13 :                 assert_true(comp_size < 261 + original.length / 8);
    1160             :         }
    1161             :         /* let's just show we didn't change the original */
    1162       65537 :         for (i = 0; i < len; i++) {
    1163       65536 :                 if (original.data[i] != 'a') {
    1164       65536 :                         fail_msg("input data[%zu] was changed! (%2x, expected %2x)\n",
    1165             :                                  i, original.data[i], 'a');
    1166             :                 }
    1167             :         }
    1168             : 
    1169           1 :         talloc_free(mem_ctx);
    1170           1 : }
    1171             : 
    1172             : 
    1173           1 : static void test_lzxpress_huffman_compress_empty_or_null(void **state)
    1174           1 : {
    1175             :         /*
    1176             :          * We expect these to fail with a -1, except the last one, which does
    1177             :          * the real thing.
    1178             :          */
    1179           1 :         ssize_t ret;
    1180           1 :         const uint8_t *input = bidirectional_pairs[0].decompressed.data;
    1181           1 :         size_t ilen = bidirectional_pairs[0].decompressed.length;
    1182           1 :         size_t olen = bidirectional_pairs[0].compressed.length;
    1183           1 :         uint8_t output[olen];
    1184           1 :         struct lzxhuff_compressor_mem cmp_mem;
    1185             : 
    1186           1 :         ret = lzxpress_huffman_compress(&cmp_mem, input, 0, output, olen);
    1187           1 :         assert_int_equal(ret, -1LL);
    1188           1 :         ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, output, 0);
    1189           1 :         assert_int_equal(ret, -1LL);
    1190             : 
    1191           1 :         ret = lzxpress_huffman_compress(&cmp_mem, NULL, ilen, output, olen);
    1192           1 :         assert_int_equal(ret, -1LL);
    1193           1 :         ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, NULL, olen);
    1194           1 :         assert_int_equal(ret, -1LL);
    1195           1 :         ret = lzxpress_huffman_compress(NULL, input, ilen, output, olen);
    1196           1 :         assert_int_equal(ret, -1LL);
    1197             : 
    1198           1 :         ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, output, olen);
    1199           1 :         assert_int_equal(ret, olen);
    1200           1 : }
    1201             : 
    1202             : 
    1203           1 : static void test_lzxpress_huffman_decompress_empty_or_null(void **state)
    1204           1 : {
    1205             :         /*
    1206             :          * We expect these to fail with a -1, except the last one.
    1207             :          */
    1208           1 :         ssize_t ret;
    1209           1 :         const uint8_t *input = bidirectional_pairs[0].compressed.data;
    1210           1 :         size_t ilen = bidirectional_pairs[0].compressed.length;
    1211           1 :         size_t olen = bidirectional_pairs[0].decompressed.length;
    1212           1 :         uint8_t output[olen];
    1213             : 
    1214           1 :         ret = lzxpress_huffman_decompress(input, 0, output, olen);
    1215           1 :         assert_int_equal(ret, -1LL);
    1216           1 :         ret = lzxpress_huffman_decompress(input, ilen, output, 0);
    1217           1 :         assert_int_equal(ret, -1LL);
    1218             : 
    1219           1 :         ret = lzxpress_huffman_decompress(NULL, ilen, output, olen);
    1220           1 :         assert_int_equal(ret, -1LL);
    1221           1 :         ret = lzxpress_huffman_decompress(input, ilen, NULL, olen);
    1222           1 :         assert_int_equal(ret, -1LL);
    1223             : 
    1224           1 :         ret = lzxpress_huffman_decompress(input, ilen, output, olen);
    1225           1 :         assert_int_equal(ret, olen);
    1226           1 : }
    1227             : 
    1228             : 
    1229           1 : int main(void) {
    1230           1 :         const struct CMUnitTest tests[] = {
    1231             :                 cmocka_unit_test(test_lzxpress_huffman_short_boring_strings),
    1232             :                 cmocka_unit_test(test_lzxpress_huffman_max_length_limit),
    1233             :                 cmocka_unit_test(test_lzxpress_huffman_extremely_compressible_middle),
    1234             :                 cmocka_unit_test(test_lzxpress_huffman_long_random_graph_round_trip),
    1235             :                 cmocka_unit_test(test_lzxpress_huffman_chaos_graph_round_trip),
    1236             :                 cmocka_unit_test(test_lzxpress_huffman_sparse_random_graph_round_trip),
    1237             :                 cmocka_unit_test(test_lzxpress_huffman_round_trip),
    1238             :                 cmocka_unit_test(test_lzxpress_huffman_decompress_files),
    1239             :                 cmocka_unit_test(test_lzxpress_huffman_decompress_more_compressed_files),
    1240             :                 cmocka_unit_test(test_lzxpress_huffman_compress),
    1241             :                 cmocka_unit_test(test_lzxpress_huffman_decompress),
    1242             :                 cmocka_unit_test(test_lzxpress_huffman_long_gpl_round_trip),
    1243             :                 cmocka_unit_test(test_lzxpress_huffman_long_random_graph_round_trip),
    1244             :                 cmocka_unit_test(test_lzxpress_huffman_random_noise_round_trip),
    1245             :                 cmocka_unit_test(test_lzxpress_huffman_overlong_matches_abc),
    1246             :                 cmocka_unit_test(test_lzxpress_huffman_overlong_matches),
    1247             :                 cmocka_unit_test(test_lzxpress_huffman_decompress_empty_or_null),
    1248             :                 cmocka_unit_test(test_lzxpress_huffman_compress_empty_or_null),
    1249             :         };
    1250           1 :         if (!isatty(1)) {
    1251           1 :                 cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
    1252             :         }
    1253             : 
    1254           1 :         return cmocka_run_group_tests(tests, NULL, NULL);
    1255             : }

Generated by: LCOV version 1.14