From 8198be8fb6f81e987c3d0a2f334f61914f77c84b Mon Sep 17 00:00:00 2001 From: Daniel Friesel Date: Mon, 1 Feb 2021 21:22:38 +0100 Subject: Add faster mode with huffman -> code look-up table 2x to 5x speed-up at the cost of ~600B of RAM. Compile with -DDEFLATE_WITH_LUT --- README.md | 69 +++++++++++++++++++++++--------------- src/inflate.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++-- test/compile-c++11.sh | 2 +- test/compile-c++20.sh | 2 +- test/compile-c11.sh | 2 +- test/compile-c99.sh | 2 +- test/test.sh | 20 ++++++++--- 7 files changed, 151 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 1d60782..17c9684 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,14 @@ **zlib-deflate-nostdlib** provides a zlib decompressor (RFC 1950) and deflate -reader (RFC 1951) suitable for 8- and 16-bit microcontrollers. It works -fine on MCUs as small as ATMega328P (used, for example, in the Arduino Nano) -and MSP430FR5994. It is compatible with both C (from c99 on) and C++. Apart -from type definitions for (u)int8\_t, (u)int16\_t, and (u)int32\_t, which are -typically provided by stdint.h, it has no external dependencies. +reader (RFC 1951) suitable for 8- and 16-bit microcontrollers. It works fine on +MCUs as small as ATMega328P (used, for example, in the Arduino Nano) and +MSP430FR5994. It is compatible with both C (from c99 on) and C++. Apart from +type definitions for (u)int8\_t, (u)int16\_t, and (u)int32\_t, which you can +provide yourself if stdint.h is not available, it has no external dependencies. -zlib-deflate-nostdlib is focused on a low memory footprint. It is not optimized -for speed and uses a pretty naive implementation right now. +zlib-deflate-nostdlib is focused on a low memory footprint and not on speed. +Depending on architecture and compilation settings, it requires **1.6 to 2.6 kB +of ROM** and **0.5 to 1.2 kB of RAM**. Decompression speed ranges from **1 to 5 +kB/s per MHz**. See below for details and tunables. Note: This library *inflates* (i.e., decompresses) data. The source files and API are named as such, as is the corresponding function in the original zlib @@ -105,42 +107,55 @@ is designed for. In that case, you are probably better off with ## Memory Requirements -Excluding the decompressed data buffer, zlib-deflate-nostdlib needs about -2.5 kB of ROM and 500 Bytes of RAM. Actual values depend on the architecture, -see the tables below. ROM/RAM values are rounded up to the next multiple of -16B. +Compilation with `-Os`. ROM/RAM values are rounded up to the next multiple of +16B and do not include the buffer for decompressede data. -### default (no checksum verification) +### baseline (no checksum verification) | Architecture | ROM | RAM | | :--- | ---: | ---: | -| 8-bit ATMega328P | 1824 B | 640 B | -| 16-bit MSP430FR5994 | 2272 B | 448 B | -| 20-bit MSP430FR5994 | 2576 B | 464 B | +| 8-bit ATMega328P | 1808 B | 640 B | +| 16-bit MSP430FR5994 | 2256 B | 448 B | +| 20-bit MSP430FR5994 | 2560 B | 464 B | | 32-bit ESP8266 | 1888 B | 656 B | -| 32-bit STM32F446RE (ARM Cortex M3) | 1600 B | 464 B | +| 32-bit STM32F446RE (ARM Cortex M3) | 1616 B | 464 B | ### compliant mode (-DDEFLATE\_CHECKSUM) +ROM = baseline + 150 to 300 B, RAM = baseline. + +### faster mode (-DDEFLATE\_WITH\_LUT) + | Architecture | ROM | RAM | | :--- | ---: | ---: | -| 8-bit ATMega328P | 2032 B | 640 B | -| 16-bit MSP430FR5994 | 2560 B | 448 B | -| 20-bit MSP430FR5994 | 2896 B | 464 B | -| 32-bit ESP8266 | 2048 B | 656 B | -| 32-bit STM32F446RE (ARM Cortex M3) | 1782 B | 464 B | +| 8-bit ATMega328P | — | — | +| 16-bit MSP430FR5994 | 2896 B | 1088 B | +| 20-bit MSP430FR5994 | 3248 B | 1088 B | +| 32-bit ESP8266 | 1856 B | 1296 B | +| 32-bit STM32F446RE (ARM Cortex M3) | 1664 B | 1104 B | + ## Performance -Due to its focus on low RAM usage, zlib-deflate-nostdlib is very slow. Expect -about 1kB/s per MHz on 16-bit and 2kB/s per MHz on 32-bit architectures. Tested -with text files of various sizes, minimum file size 500 bytes, maximum file -size determined by the amount of available RAM. +Tested with text files of various sizes, minimum file size 500 bytes, maximum +file size determined by the amount of available RAM. + +### baseline (no checksum verification) | Architecture | Speed @ 1 MHz | Speed | CPU Clock | | :--- | ---: | ---: | ---: | | 8-bit ATMega328P | 1 kB/s | 10 .. 22 kB/s | 16 MHz | -| 16-bit MSP430FR5994 | 1 kB/s | 8..15 kB/s | 16 MHz | -| 20-bit MSP430FR5994 | 1 kB/s | 8..17 kB/s | 16 MHz | +| 16-bit MSP430FR5994 | 1 kB/s | 8..16 kB/s | 16 MHz | +| 20-bit MSP430FR5994 | 1 kB/s | 8..16 kB/s | 16 MHz | | 32-bit ESP8266 | 1 .. 3 kB/s | 79..246 kB/s | 80 MHz | | 32-bit STM32F446RE (ARM Cortex M3) | 1 .. 5 kB/s | 282..875 kB/s | 168 MHz | + +### faster mode (-DDEFLATE\_WITH\_LUT) + +| Architecture | Speed @ 1 MHz | Speed | CPU Clock | +| :--- | ---: | ---: | ---: | +| 8-bit ATMega328P | — | — | 16 MHz | +| 16-bit MSP430FR5994 | 2 kB/s | 22..37 kB/s | 16 MHz | +| 20-bit MSP430FR5994 | 2 kB/s | 20..34 kB/s | 16 MHz | +| 32-bit ESP8266 | 3 .. 8 kB/s | 234..671 kB/s | 80 MHz | +| 32-bit STM32F446RE (ARM Cortex M3) | 6 .. 17 kB/s | 986..2815 kB/s | 168 MHz | diff --git a/src/inflate.c b/src/inflate.c index 27dcc79..b536689 100644 --- a/src/inflate.c +++ b/src/inflate.c @@ -92,6 +92,11 @@ uint8_t deflate_hc_lengths[19]; */ uint8_t deflate_lld_lengths[318]; +#ifdef DEFLATE_WITH_LUT +uint16_t deflate_ll_codes[288]; +uint16_t deflate_d_codes[30]; +#endif + /* * Bit length counts and next code entries for Literal/Length alphabet. * Combined with the code lengths in deflate_lld_lengths, these make up the @@ -159,8 +164,14 @@ static uint16_t deflate_get_bits(uint8_t num_bits) return ret & deflate_bitmask(num_bits); } +#ifdef DEFLATE_WITH_LUT +static void deflate_build_alphabet(uint8_t * lengths, uint16_t size, + uint8_t * bl_count, uint16_t * next_code, + uint16_t * codes) +#else static void deflate_build_alphabet(uint8_t * lengths, uint16_t size, uint8_t * bl_count, uint16_t * next_code) +#endif { uint16_t i; uint16_t code = 0; @@ -178,12 +189,28 @@ static void deflate_build_alphabet(uint8_t * lengths, uint16_t size, } } - for (i = 1; i < max_len + 1; i++) { + for (i = 1; i <= max_len; i++) { code = (code + bl_count[i - 1]) << 1; next_code[i] = code; } + +#ifdef DEFLATE_WITH_LUT + uint8_t j = 0; + code = 0; + for (j = 1; j <= max_len; j++) { + for (i = 0; i < size; i++) { + if (lengths[i] == j) { + codes[code++] = i; + } + } + } +#endif } +#ifdef DEFLATE_WITH_LUT +static uint16_t deflate_huff(uint16_t * codes, + uint8_t * bl_count, uint16_t * next_code) +#else /* * This function trades speed for low memory requirements. Instead of building * an actual huffman tree (at a cost of about 650 Bytes of RAM), we iterate @@ -192,8 +219,12 @@ static void deflate_build_alphabet(uint8_t * lengths, uint16_t size, */ static uint16_t deflate_huff(uint8_t * lengths, uint16_t size, uint8_t * bl_count, uint16_t * next_code) +#endif { uint16_t next_word = deflate_get_word(); +#ifdef DEFLATE_WITH_LUT + uint16_t code = 0; +#endif for (uint8_t num_bits = 1; num_bits < 16; num_bits++) { uint16_t next_bits = deflate_rev_word(next_word, num_bits); if (bl_count[num_bits] && next_bits >= next_code[num_bits] @@ -203,9 +234,11 @@ static uint16_t deflate_huff(uint8_t * lengths, uint16_t size, deflate_input_now++; deflate_bit_offset -= 8; } +#ifdef DEFLATE_WITH_LUT + return codes[code + (next_bits - next_code[num_bits])]; +#else uint8_t len_pos = next_bits; uint8_t cur_pos = next_code[num_bits]; - // This is slow, but memory-efficient for (uint16_t i = 0; i < size; i++) { if (lengths[i] == num_bits) { if (cur_pos == len_pos) { @@ -214,20 +247,35 @@ static uint16_t deflate_huff(uint8_t * lengths, uint16_t size, cur_pos++; } } +#endif + } else { +#ifdef DEFLATE_WITH_LUT + code += bl_count[num_bits]; +#endif } } return 65535; } +#ifdef DEFLATE_WITH_LUT +static int8_t deflate_huffman(uint16_t * ll_codes, uint16_t * d_codes) +#else static int8_t deflate_huffman(uint8_t * ll_lengths, uint16_t ll_size, uint8_t * d_lengths, uint8_t d_size) +#endif { uint16_t code; uint16_t dcode; while (1) { +#ifdef DEFLATE_WITH_LUT + code = + deflate_huff(ll_codes, deflate_bl_count_ll, + deflate_next_code_ll); +#else code = deflate_huff(ll_lengths, ll_size, deflate_bl_count_ll, deflate_next_code_ll); +#endif if (code < 256) { if (deflate_output_now == deflate_output_end) { return DEFLATE_ERR_OUTPUT_LENGTH; @@ -244,10 +292,17 @@ static int8_t deflate_huffman(uint8_t * ll_lengths, uint16_t ll_size, if (extra_bits) { len_val += deflate_get_bits(extra_bits); } +#ifdef DEFLATE_WITH_LUT + dcode = + deflate_huff(d_codes, + deflate_bl_count_d, + deflate_next_code_d); +#else dcode = deflate_huff(d_lengths, d_size, deflate_bl_count_d, deflate_next_code_d); +#endif uint16_t dist_val = deflate_distance_offsets[dcode]; extra_bits = deflate_distance_bits[dcode]; if (extra_bits) { @@ -313,12 +368,21 @@ static int8_t deflate_static_huffman() deflate_lld_lengths[i] = 5; } +#ifdef DEFLATE_WITH_LUT + deflate_build_alphabet(deflate_lld_lengths, 288, deflate_bl_count_ll, + deflate_next_code_ll, deflate_ll_codes); + deflate_build_alphabet(deflate_lld_lengths + 288, 29, + deflate_bl_count_d, deflate_next_code_d, + deflate_d_codes); + return deflate_huffman(deflate_ll_codes, deflate_d_codes); +#else deflate_build_alphabet(deflate_lld_lengths, 288, deflate_bl_count_ll, deflate_next_code_ll); deflate_build_alphabet(deflate_lld_lengths + 288, 29, deflate_bl_count_d, deflate_next_code_d); return deflate_huffman(deflate_lld_lengths, 288, deflate_lld_lengths + 288, 29); +#endif } static int8_t deflate_dynamic_huffman() @@ -336,16 +400,29 @@ static int8_t deflate_dynamic_huffman() deflate_hc_lengths[deflate_hclen_index[i]] = 0; } +#ifdef DEFLATE_WITH_LUT + deflate_build_alphabet(deflate_hc_lengths, + sizeof(deflate_hc_lengths), + deflate_bl_count_ll, deflate_next_code_ll, + deflate_ll_codes); +#else deflate_build_alphabet(deflate_hc_lengths, sizeof(deflate_hc_lengths), deflate_bl_count_ll, deflate_next_code_ll); +#endif uint16_t items_processed = 0; while (items_processed < hlit + hdist) { +#ifdef DEFLATE_WITH_LUT + uint8_t code = deflate_huff(deflate_ll_codes, + deflate_bl_count_ll, + deflate_next_code_ll); +#else uint8_t code = deflate_huff(deflate_hc_lengths, sizeof(deflate_hc_lengths), deflate_bl_count_ll, deflate_next_code_ll); +#endif if (code == 16) { uint8_t copy_count = 3 + deflate_get_bits(2); for (uint8_t i = 0; i < copy_count; i++) { @@ -371,13 +448,22 @@ static int8_t deflate_dynamic_huffman() } } +#ifdef DEFLATE_WITH_LUT + deflate_build_alphabet(deflate_lld_lengths, hlit, + deflate_bl_count_ll, deflate_next_code_ll, + deflate_ll_codes); + deflate_build_alphabet(deflate_lld_lengths + hlit, hdist, + deflate_bl_count_d, deflate_next_code_d, + deflate_d_codes); + return deflate_huffman(deflate_ll_codes, deflate_d_codes); +#else deflate_build_alphabet(deflate_lld_lengths, hlit, deflate_bl_count_ll, deflate_next_code_ll); deflate_build_alphabet(deflate_lld_lengths + hlit, hdist, deflate_bl_count_d, deflate_next_code_d); - return deflate_huffman(deflate_lld_lengths, hlit, deflate_lld_lengths + hlit, hdist); +#endif } int16_t inflate(unsigned char *input_buf, uint16_t input_len, diff --git a/test/compile-c++11.sh b/test/compile-c++11.sh index cb13625..b3305a5 100755 --- a/test/compile-c++11.sh +++ b/test/compile-c++11.sh @@ -1,3 +1,3 @@ #!/bin/sh -exec g++ -std=c++11 -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c +exec g++ -std=c++11 -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c diff --git a/test/compile-c++20.sh b/test/compile-c++20.sh index 502063b..aa9f02a 100755 --- a/test/compile-c++20.sh +++ b/test/compile-c++20.sh @@ -1,4 +1,4 @@ #!/bin/sh # g++ as provided by Debian Buster (used for CI tests) does not support c++20 -exec g++ -std=c++2a -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c +exec g++ -std=c++2a -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c diff --git a/test/compile-c11.sh b/test/compile-c11.sh index 87ec632..c9ba1c4 100755 --- a/test/compile-c11.sh +++ b/test/compile-c11.sh @@ -1,3 +1,3 @@ #!/bin/sh -exec gcc -std=c11 -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c +exec gcc -std=c11 -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c diff --git a/test/compile-c99.sh b/test/compile-c99.sh index 4583ebf..c13617d 100755 --- a/test/compile-c99.sh +++ b/test/compile-c99.sh @@ -1,3 +1,3 @@ #!/bin/sh -exec gcc -std=c99 -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c +exec gcc -std=c99 -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c diff --git a/test/test.sh b/test/test.sh index 44104d5..60e3e26 100755 --- a/test/test.sh +++ b/test/test.sh @@ -4,10 +4,7 @@ set -eu cd "$(dirname "$0")" -for std in c++11 c++20 c99 c11; do - - "./compile-${std}.sh" - +run_tests() { for file in $(find .. -type f -size -32760c); do if ! ./deflate $file | ./inflate > tmp; then echo "inflate error at $file" @@ -15,6 +12,21 @@ for std in c++11 c++20 c99 c11; do fi diff $file tmp done +} + +for std in c++11 c++20 c99 c11; do + + "./compile-${std}.sh" + run_tests + + "./compile-${std}.sh" -DDEFLATE_CHECKSUM + run_tests + + "./compile-${std}.sh" -DDEFLATE_WITH_LUT + run_tests + + "./compile-${std}.sh" -DDEFLATE_CHECKSUM -DDEFLATE_WITH_LUT + run_tests done -- cgit v1.2.3