summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Friesel <derf@finalrewind.org>2021-02-01 21:22:38 +0100
committerDaniel Friesel <derf@finalrewind.org>2021-02-01 21:22:38 +0100
commit8198be8fb6f81e987c3d0a2f334f61914f77c84b (patch)
treeff47ac7e48ba952c8f9ae7e8194ae3c396f619b2
parente522e5cdb583d20b712c7d9c842fc6ce9d3fe17d (diff)
Add faster mode with huffman -> code look-up tablemain
2x to 5x speed-up at the cost of ~600B of RAM. Compile with -DDEFLATE_WITH_LUT
-rw-r--r--README.md69
-rw-r--r--src/inflate.c92
-rwxr-xr-xtest/compile-c++11.sh2
-rwxr-xr-xtest/compile-c++20.sh2
-rwxr-xr-xtest/compile-c11.sh2
-rwxr-xr-xtest/compile-c99.sh2
-rwxr-xr-xtest/test.sh20
7 files changed, 151 insertions, 38 deletions
diff --git a/README.md b/README.md
index 1d60782..17c9684 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,14 @@
**zlib-deflate-nostdlib** provides a zlib decompressor (RFC 1950) and deflate
-reader (RFC 1951) suitable for 8- and 16-bit microcontrollers. It works
-fine on MCUs as small as ATMega328P (used, for example, in the Arduino Nano)
-and MSP430FR5994. It is compatible with both C (from c99 on) and C++. Apart
-from type definitions for (u)int8\_t, (u)int16\_t, and (u)int32\_t, which are
-typically provided by stdint.h, it has no external dependencies.
+reader (RFC 1951) suitable for 8- and 16-bit microcontrollers. It works fine on
+MCUs as small as ATMega328P (used, for example, in the Arduino Nano) and
+MSP430FR5994. It is compatible with both C (from c99 on) and C++. Apart from
+type definitions for (u)int8\_t, (u)int16\_t, and (u)int32\_t, which you can
+provide yourself if stdint.h is not available, it has no external dependencies.
-zlib-deflate-nostdlib is focused on a low memory footprint. It is not optimized
-for speed and uses a pretty naive implementation right now.
+zlib-deflate-nostdlib is focused on a low memory footprint and not on speed.
+Depending on architecture and compilation settings, it requires **1.6 to 2.6 kB
+of ROM** and **0.5 to 1.2 kB of RAM**. Decompression speed ranges from **1 to 5
+kB/s per MHz**. See below for details and tunables.
Note: This library *inflates* (i.e., decompresses) data. The source files and
API are named as such, as is the corresponding function in the original zlib
@@ -105,42 +107,55 @@ is designed for. In that case, you are probably better off with
## Memory Requirements
-Excluding the decompressed data buffer, zlib-deflate-nostdlib needs about
-2.5 kB of ROM and 500 Bytes of RAM. Actual values depend on the architecture,
-see the tables below. ROM/RAM values are rounded up to the next multiple of
-16B.
+Compilation with `-Os`. ROM/RAM values are rounded up to the next multiple of
+16B and do not include the buffer for decompressede data.
-### default (no checksum verification)
+### baseline (no checksum verification)
| Architecture | ROM | RAM |
| :--- | ---: | ---: |
-| 8-bit ATMega328P | 1824 B | 640 B |
-| 16-bit MSP430FR5994 | 2272 B | 448 B |
-| 20-bit MSP430FR5994 | 2576 B | 464 B |
+| 8-bit ATMega328P | 1808 B | 640 B |
+| 16-bit MSP430FR5994 | 2256 B | 448 B |
+| 20-bit MSP430FR5994 | 2560 B | 464 B |
| 32-bit ESP8266 | 1888 B | 656 B |
-| 32-bit STM32F446RE (ARM Cortex M3) | 1600 B | 464 B |
+| 32-bit STM32F446RE (ARM Cortex M3) | 1616 B | 464 B |
### compliant mode (-DDEFLATE\_CHECKSUM)
+ROM = baseline + 150 to 300 B, RAM = baseline.
+
+### faster mode (-DDEFLATE\_WITH\_LUT)
+
| Architecture | ROM | RAM |
| :--- | ---: | ---: |
-| 8-bit ATMega328P | 2032 B | 640 B |
-| 16-bit MSP430FR5994 | 2560 B | 448 B |
-| 20-bit MSP430FR5994 | 2896 B | 464 B |
-| 32-bit ESP8266 | 2048 B | 656 B |
-| 32-bit STM32F446RE (ARM Cortex M3) | 1782 B | 464 B |
+| 8-bit ATMega328P | — | — |
+| 16-bit MSP430FR5994 | 2896 B | 1088 B |
+| 20-bit MSP430FR5994 | 3248 B | 1088 B |
+| 32-bit ESP8266 | 1856 B | 1296 B |
+| 32-bit STM32F446RE (ARM Cortex M3) | 1664 B | 1104 B |
+
## Performance
-Due to its focus on low RAM usage, zlib-deflate-nostdlib is very slow. Expect
-about 1kB/s per MHz on 16-bit and 2kB/s per MHz on 32-bit architectures. Tested
-with text files of various sizes, minimum file size 500 bytes, maximum file
-size determined by the amount of available RAM.
+Tested with text files of various sizes, minimum file size 500 bytes, maximum
+file size determined by the amount of available RAM.
+
+### baseline (no checksum verification)
| Architecture | Speed @ 1 MHz | Speed | CPU Clock |
| :--- | ---: | ---: | ---: |
| 8-bit ATMega328P | 1 kB/s | 10 .. 22 kB/s | 16 MHz |
-| 16-bit MSP430FR5994 | 1 kB/s | 8..15 kB/s | 16 MHz |
-| 20-bit MSP430FR5994 | 1 kB/s | 8..17 kB/s | 16 MHz |
+| 16-bit MSP430FR5994 | 1 kB/s | 8..16 kB/s | 16 MHz |
+| 20-bit MSP430FR5994 | 1 kB/s | 8..16 kB/s | 16 MHz |
| 32-bit ESP8266 | 1 .. 3 kB/s | 79..246 kB/s | 80 MHz |
| 32-bit STM32F446RE (ARM Cortex M3) | 1 .. 5 kB/s | 282..875 kB/s | 168 MHz |
+
+### faster mode (-DDEFLATE\_WITH\_LUT)
+
+| Architecture | Speed @ 1 MHz | Speed | CPU Clock |
+| :--- | ---: | ---: | ---: |
+| 8-bit ATMega328P | — | — | 16 MHz |
+| 16-bit MSP430FR5994 | 2 kB/s | 22..37 kB/s | 16 MHz |
+| 20-bit MSP430FR5994 | 2 kB/s | 20..34 kB/s | 16 MHz |
+| 32-bit ESP8266 | 3 .. 8 kB/s | 234..671 kB/s | 80 MHz |
+| 32-bit STM32F446RE (ARM Cortex M3) | 6 .. 17 kB/s | 986..2815 kB/s | 168 MHz |
diff --git a/src/inflate.c b/src/inflate.c
index 27dcc79..b536689 100644
--- a/src/inflate.c
+++ b/src/inflate.c
@@ -92,6 +92,11 @@ uint8_t deflate_hc_lengths[19];
*/
uint8_t deflate_lld_lengths[318];
+#ifdef DEFLATE_WITH_LUT
+uint16_t deflate_ll_codes[288];
+uint16_t deflate_d_codes[30];
+#endif
+
/*
* Bit length counts and next code entries for Literal/Length alphabet.
* Combined with the code lengths in deflate_lld_lengths, these make up the
@@ -159,8 +164,14 @@ static uint16_t deflate_get_bits(uint8_t num_bits)
return ret & deflate_bitmask(num_bits);
}
+#ifdef DEFLATE_WITH_LUT
+static void deflate_build_alphabet(uint8_t * lengths, uint16_t size,
+ uint8_t * bl_count, uint16_t * next_code,
+ uint16_t * codes)
+#else
static void deflate_build_alphabet(uint8_t * lengths, uint16_t size,
uint8_t * bl_count, uint16_t * next_code)
+#endif
{
uint16_t i;
uint16_t code = 0;
@@ -178,12 +189,28 @@ static void deflate_build_alphabet(uint8_t * lengths, uint16_t size,
}
}
- for (i = 1; i < max_len + 1; i++) {
+ for (i = 1; i <= max_len; i++) {
code = (code + bl_count[i - 1]) << 1;
next_code[i] = code;
}
+
+#ifdef DEFLATE_WITH_LUT
+ uint8_t j = 0;
+ code = 0;
+ for (j = 1; j <= max_len; j++) {
+ for (i = 0; i < size; i++) {
+ if (lengths[i] == j) {
+ codes[code++] = i;
+ }
+ }
+ }
+#endif
}
+#ifdef DEFLATE_WITH_LUT
+static uint16_t deflate_huff(uint16_t * codes,
+ uint8_t * bl_count, uint16_t * next_code)
+#else
/*
* This function trades speed for low memory requirements. Instead of building
* an actual huffman tree (at a cost of about 650 Bytes of RAM), we iterate
@@ -192,8 +219,12 @@ static void deflate_build_alphabet(uint8_t * lengths, uint16_t size,
*/
static uint16_t deflate_huff(uint8_t * lengths, uint16_t size,
uint8_t * bl_count, uint16_t * next_code)
+#endif
{
uint16_t next_word = deflate_get_word();
+#ifdef DEFLATE_WITH_LUT
+ uint16_t code = 0;
+#endif
for (uint8_t num_bits = 1; num_bits < 16; num_bits++) {
uint16_t next_bits = deflate_rev_word(next_word, num_bits);
if (bl_count[num_bits] && next_bits >= next_code[num_bits]
@@ -203,9 +234,11 @@ static uint16_t deflate_huff(uint8_t * lengths, uint16_t size,
deflate_input_now++;
deflate_bit_offset -= 8;
}
+#ifdef DEFLATE_WITH_LUT
+ return codes[code + (next_bits - next_code[num_bits])];
+#else
uint8_t len_pos = next_bits;
uint8_t cur_pos = next_code[num_bits];
- // This is slow, but memory-efficient
for (uint16_t i = 0; i < size; i++) {
if (lengths[i] == num_bits) {
if (cur_pos == len_pos) {
@@ -214,20 +247,35 @@ static uint16_t deflate_huff(uint8_t * lengths, uint16_t size,
cur_pos++;
}
}
+#endif
+ } else {
+#ifdef DEFLATE_WITH_LUT
+ code += bl_count[num_bits];
+#endif
}
}
return 65535;
}
+#ifdef DEFLATE_WITH_LUT
+static int8_t deflate_huffman(uint16_t * ll_codes, uint16_t * d_codes)
+#else
static int8_t deflate_huffman(uint8_t * ll_lengths, uint16_t ll_size,
uint8_t * d_lengths, uint8_t d_size)
+#endif
{
uint16_t code;
uint16_t dcode;
while (1) {
+#ifdef DEFLATE_WITH_LUT
+ code =
+ deflate_huff(ll_codes, deflate_bl_count_ll,
+ deflate_next_code_ll);
+#else
code =
deflate_huff(ll_lengths, ll_size, deflate_bl_count_ll,
deflate_next_code_ll);
+#endif
if (code < 256) {
if (deflate_output_now == deflate_output_end) {
return DEFLATE_ERR_OUTPUT_LENGTH;
@@ -244,10 +292,17 @@ static int8_t deflate_huffman(uint8_t * ll_lengths, uint16_t ll_size,
if (extra_bits) {
len_val += deflate_get_bits(extra_bits);
}
+#ifdef DEFLATE_WITH_LUT
+ dcode =
+ deflate_huff(d_codes,
+ deflate_bl_count_d,
+ deflate_next_code_d);
+#else
dcode =
deflate_huff(d_lengths, d_size,
deflate_bl_count_d,
deflate_next_code_d);
+#endif
uint16_t dist_val = deflate_distance_offsets[dcode];
extra_bits = deflate_distance_bits[dcode];
if (extra_bits) {
@@ -313,12 +368,21 @@ static int8_t deflate_static_huffman()
deflate_lld_lengths[i] = 5;
}
+#ifdef DEFLATE_WITH_LUT
+ deflate_build_alphabet(deflate_lld_lengths, 288, deflate_bl_count_ll,
+ deflate_next_code_ll, deflate_ll_codes);
+ deflate_build_alphabet(deflate_lld_lengths + 288, 29,
+ deflate_bl_count_d, deflate_next_code_d,
+ deflate_d_codes);
+ return deflate_huffman(deflate_ll_codes, deflate_d_codes);
+#else
deflate_build_alphabet(deflate_lld_lengths, 288, deflate_bl_count_ll,
deflate_next_code_ll);
deflate_build_alphabet(deflate_lld_lengths + 288, 29,
deflate_bl_count_d, deflate_next_code_d);
return deflate_huffman(deflate_lld_lengths, 288,
deflate_lld_lengths + 288, 29);
+#endif
}
static int8_t deflate_dynamic_huffman()
@@ -336,16 +400,29 @@ static int8_t deflate_dynamic_huffman()
deflate_hc_lengths[deflate_hclen_index[i]] = 0;
}
+#ifdef DEFLATE_WITH_LUT
+ deflate_build_alphabet(deflate_hc_lengths,
+ sizeof(deflate_hc_lengths),
+ deflate_bl_count_ll, deflate_next_code_ll,
+ deflate_ll_codes);
+#else
deflate_build_alphabet(deflate_hc_lengths,
sizeof(deflate_hc_lengths),
deflate_bl_count_ll, deflate_next_code_ll);
+#endif
uint16_t items_processed = 0;
while (items_processed < hlit + hdist) {
+#ifdef DEFLATE_WITH_LUT
+ uint8_t code = deflate_huff(deflate_ll_codes,
+ deflate_bl_count_ll,
+ deflate_next_code_ll);
+#else
uint8_t code =
deflate_huff(deflate_hc_lengths, sizeof(deflate_hc_lengths),
deflate_bl_count_ll,
deflate_next_code_ll);
+#endif
if (code == 16) {
uint8_t copy_count = 3 + deflate_get_bits(2);
for (uint8_t i = 0; i < copy_count; i++) {
@@ -371,13 +448,22 @@ static int8_t deflate_dynamic_huffman()
}
}
+#ifdef DEFLATE_WITH_LUT
+ deflate_build_alphabet(deflate_lld_lengths, hlit,
+ deflate_bl_count_ll, deflate_next_code_ll,
+ deflate_ll_codes);
+ deflate_build_alphabet(deflate_lld_lengths + hlit, hdist,
+ deflate_bl_count_d, deflate_next_code_d,
+ deflate_d_codes);
+ return deflate_huffman(deflate_ll_codes, deflate_d_codes);
+#else
deflate_build_alphabet(deflate_lld_lengths, hlit,
deflate_bl_count_ll, deflate_next_code_ll);
deflate_build_alphabet(deflate_lld_lengths + hlit, hdist,
deflate_bl_count_d, deflate_next_code_d);
-
return deflate_huffman(deflate_lld_lengths, hlit,
deflate_lld_lengths + hlit, hdist);
+#endif
}
int16_t inflate(unsigned char *input_buf, uint16_t input_len,
diff --git a/test/compile-c++11.sh b/test/compile-c++11.sh
index cb13625..b3305a5 100755
--- a/test/compile-c++11.sh
+++ b/test/compile-c++11.sh
@@ -1,3 +1,3 @@
#!/bin/sh
-exec g++ -std=c++11 -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c
+exec g++ -std=c++11 -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c
diff --git a/test/compile-c++20.sh b/test/compile-c++20.sh
index 502063b..aa9f02a 100755
--- a/test/compile-c++20.sh
+++ b/test/compile-c++20.sh
@@ -1,4 +1,4 @@
#!/bin/sh
# g++ as provided by Debian Buster (used for CI tests) does not support c++20
-exec g++ -std=c++2a -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c
+exec g++ -std=c++2a -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c
diff --git a/test/compile-c11.sh b/test/compile-c11.sh
index 87ec632..c9ba1c4 100755
--- a/test/compile-c11.sh
+++ b/test/compile-c11.sh
@@ -1,3 +1,3 @@
#!/bin/sh
-exec gcc -std=c11 -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c
+exec gcc -std=c11 -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c
diff --git a/test/compile-c99.sh b/test/compile-c99.sh
index 4583ebf..c13617d 100755
--- a/test/compile-c99.sh
+++ b/test/compile-c99.sh
@@ -1,3 +1,3 @@
#!/bin/sh
-exec gcc -std=c99 -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c
+exec gcc -std=c99 -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c
diff --git a/test/test.sh b/test/test.sh
index 44104d5..60e3e26 100755
--- a/test/test.sh
+++ b/test/test.sh
@@ -4,10 +4,7 @@ set -eu
cd "$(dirname "$0")"
-for std in c++11 c++20 c99 c11; do
-
- "./compile-${std}.sh"
-
+run_tests() {
for file in $(find .. -type f -size -32760c); do
if ! ./deflate $file | ./inflate > tmp; then
echo "inflate error at $file"
@@ -15,6 +12,21 @@ for std in c++11 c++20 c99 c11; do
fi
diff $file tmp
done
+}
+
+for std in c++11 c++20 c99 c11; do
+
+ "./compile-${std}.sh"
+ run_tests
+
+ "./compile-${std}.sh" -DDEFLATE_CHECKSUM
+ run_tests
+
+ "./compile-${std}.sh" -DDEFLATE_WITH_LUT
+ run_tests
+
+ "./compile-${std}.sh" -DDEFLATE_CHECKSUM -DDEFLATE_WITH_LUT
+ run_tests
done