From 8198be8fb6f81e987c3d0a2f334f61914f77c84b Mon Sep 17 00:00:00 2001
From: Daniel Friesel <derf@finalrewind.org>
Date: Mon, 1 Feb 2021 21:22:38 +0100
Subject: Add faster mode with huffman -> code look-up table

2x to 5x speed-up at the cost of ~600B of RAM. Compile with -DDEFLATE_WITH_LUT
---
 README.md             | 69 +++++++++++++++++++++++---------------
 src/inflate.c         | 92 +++++++++++++++++++++++++++++++++++++++++++++++++--
 test/compile-c++11.sh |  2 +-
 test/compile-c++20.sh |  2 +-
 test/compile-c11.sh   |  2 +-
 test/compile-c99.sh   |  2 +-
 test/test.sh          | 20 ++++++++---
 7 files changed, 151 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md
index 1d60782..17c9684 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,14 @@
 **zlib-deflate-nostdlib** provides a zlib decompressor (RFC 1950) and deflate
-reader (RFC 1951) suitable for 8- and 16-bit microcontrollers. It works
-fine on MCUs as small as ATMega328P (used, for example, in the Arduino Nano)
-and MSP430FR5994. It is compatible with both C (from c99 on) and C++.  Apart
-from type definitions for (u)int8\_t, (u)int16\_t, and (u)int32\_t, which are
-typically provided by stdint.h, it has no external dependencies.
+reader (RFC 1951) suitable for 8- and 16-bit microcontrollers. It works fine on
+MCUs as small as ATMega328P (used, for example, in the Arduino Nano) and
+MSP430FR5994. It is compatible with both C (from c99 on) and C++.  Apart from
+type definitions for (u)int8\_t, (u)int16\_t, and (u)int32\_t, which you can
+provide yourself if stdint.h is not available, it has no external dependencies.
 
-zlib-deflate-nostdlib is focused on a low memory footprint. It is not optimized
-for speed and uses a pretty naive implementation right now.
+zlib-deflate-nostdlib is focused on a low memory footprint and not on speed.
+Depending on architecture and compilation settings, it requires **1.6 to 2.6 kB
+of ROM** and **0.5 to 1.2 kB of RAM**. Decompression speed ranges from **1 to 5
+kB/s per MHz**.  See below for details and tunables.
 
 Note: This library *inflates* (i.e., decompresses) data. The source files and
 API are named as such, as is the corresponding function in the original zlib
@@ -105,42 +107,55 @@ is designed for. In that case, you are probably better off with
 
 ## Memory Requirements
 
-Excluding the decompressed data buffer, zlib-deflate-nostdlib needs about
-2.5 kB of ROM and 500 Bytes of RAM. Actual values depend on the architecture,
-see the tables below. ROM/RAM values are rounded up to the next multiple of
-16B.
+Compilation with `-Os`. ROM/RAM values are rounded up to the next multiple of
+16B and do not include the buffer for decompressede data.
 
-### default (no checksum verification)
+### baseline (no checksum verification)
 
 | Architecture | ROM | RAM |
 | :--- | ---: | ---: |
-| 8-bit ATMega328P | 1824 B | 640 B |
-| 16-bit MSP430FR5994 | 2272 B | 448 B |
-| 20-bit MSP430FR5994 | 2576 B | 464 B |
+| 8-bit ATMega328P | 1808 B | 640 B |
+| 16-bit MSP430FR5994 | 2256 B | 448 B |
+| 20-bit MSP430FR5994 | 2560 B | 464 B |
 | 32-bit ESP8266 | 1888 B | 656 B |
-| 32-bit STM32F446RE (ARM Cortex M3) | 1600 B | 464 B |
+| 32-bit STM32F446RE (ARM Cortex M3) | 1616 B | 464 B |
 
 ### compliant mode (-DDEFLATE\_CHECKSUM)
 
+ROM = baseline + 150 to 300 B, RAM = baseline.
+
+### faster mode (-DDEFLATE\_WITH\_LUT)
+
 | Architecture | ROM | RAM |
 | :--- | ---: | ---: |
-| 8-bit ATMega328P | 2032 B | 640 B |
-| 16-bit MSP430FR5994 | 2560 B | 448 B |
-| 20-bit MSP430FR5994 | 2896 B | 464 B |
-| 32-bit ESP8266 | 2048 B | 656 B |
-| 32-bit STM32F446RE (ARM Cortex M3) | 1782 B | 464 B |
+| 8-bit ATMega328P | — | — |
+| 16-bit MSP430FR5994 | 2896 B | 1088 B |
+| 20-bit MSP430FR5994 | 3248 B | 1088 B |
+| 32-bit ESP8266 | 1856 B | 1296 B |
+| 32-bit STM32F446RE (ARM Cortex M3) | 1664 B | 1104 B |
+
 
 ## Performance
 
-Due to its focus on low RAM usage, zlib-deflate-nostdlib is very slow. Expect
-about 1kB/s per MHz on 16-bit and 2kB/s per MHz on 32-bit architectures. Tested
-with text files of various sizes, minimum file size 500 bytes, maximum file
-size determined by the amount of available RAM.
+Tested with text files of various sizes, minimum file size 500 bytes, maximum
+file size determined by the amount of available RAM.
+
+### baseline (no checksum verification)
 
 | Architecture | Speed @ 1 MHz | Speed | CPU Clock |
 | :--- | ---: | ---: | ---: |
 | 8-bit ATMega328P | 1 kB/s | 10 .. 22 kB/s | 16 MHz |
-| 16-bit MSP430FR5994 | 1 kB/s | 8..15 kB/s | 16 MHz |
-| 20-bit MSP430FR5994 | 1 kB/s | 8..17 kB/s | 16 MHz |
+| 16-bit MSP430FR5994 | 1 kB/s | 8..16 kB/s | 16 MHz |
+| 20-bit MSP430FR5994 | 1 kB/s | 8..16 kB/s | 16 MHz |
 | 32-bit ESP8266 | 1 .. 3 kB/s | 79..246 kB/s | 80 MHz |
 | 32-bit STM32F446RE (ARM Cortex M3) | 1 .. 5 kB/s | 282..875 kB/s | 168 MHz |
+
+### faster mode (-DDEFLATE\_WITH\_LUT)
+
+| Architecture | Speed @ 1 MHz | Speed | CPU Clock |
+| :--- | ---: | ---: | ---: |
+| 8-bit ATMega328P | — | — | 16 MHz |
+| 16-bit MSP430FR5994 | 2 kB/s | 22..37 kB/s | 16 MHz |
+| 20-bit MSP430FR5994 | 2 kB/s | 20..34 kB/s | 16 MHz |
+| 32-bit ESP8266 | 3 .. 8 kB/s | 234..671 kB/s | 80 MHz |
+| 32-bit STM32F446RE (ARM Cortex M3) | 6 .. 17 kB/s | 986..2815 kB/s | 168 MHz |
diff --git a/src/inflate.c b/src/inflate.c
index 27dcc79..b536689 100644
--- a/src/inflate.c
+++ b/src/inflate.c
@@ -92,6 +92,11 @@ uint8_t deflate_hc_lengths[19];
  */
 uint8_t deflate_lld_lengths[318];
 
+#ifdef DEFLATE_WITH_LUT
+uint16_t deflate_ll_codes[288];
+uint16_t deflate_d_codes[30];
+#endif
+
 /*
  * Bit length counts and next code entries for Literal/Length alphabet.
  * Combined with the code lengths in deflate_lld_lengths, these make up the
@@ -159,8 +164,14 @@ static uint16_t deflate_get_bits(uint8_t num_bits)
 	return ret & deflate_bitmask(num_bits);
 }
 
+#ifdef DEFLATE_WITH_LUT
+static void deflate_build_alphabet(uint8_t * lengths, uint16_t size,
+				   uint8_t * bl_count, uint16_t * next_code,
+				   uint16_t * codes)
+#else
 static void deflate_build_alphabet(uint8_t * lengths, uint16_t size,
 				   uint8_t * bl_count, uint16_t * next_code)
+#endif
 {
 	uint16_t i;
 	uint16_t code = 0;
@@ -178,12 +189,28 @@ static void deflate_build_alphabet(uint8_t * lengths, uint16_t size,
 		}
 	}
 
-	for (i = 1; i < max_len + 1; i++) {
+	for (i = 1; i <= max_len; i++) {
 		code = (code + bl_count[i - 1]) << 1;
 		next_code[i] = code;
 	}
+
+#ifdef DEFLATE_WITH_LUT
+	uint8_t j = 0;
+	code = 0;
+	for (j = 1; j <= max_len; j++) {
+		for (i = 0; i < size; i++) {
+			if (lengths[i] == j) {
+				codes[code++] = i;
+			}
+		}
+	}
+#endif
 }
 
+#ifdef DEFLATE_WITH_LUT
+static uint16_t deflate_huff(uint16_t * codes,
+			     uint8_t * bl_count, uint16_t * next_code)
+#else
 /*
  * This function trades speed for low memory requirements. Instead of building
  * an actual huffman tree (at a cost of about 650 Bytes of RAM), we iterate
@@ -192,8 +219,12 @@ static void deflate_build_alphabet(uint8_t * lengths, uint16_t size,
  */
 static uint16_t deflate_huff(uint8_t * lengths, uint16_t size,
 			     uint8_t * bl_count, uint16_t * next_code)
+#endif
 {
 	uint16_t next_word = deflate_get_word();
+#ifdef DEFLATE_WITH_LUT
+	uint16_t code = 0;
+#endif
 	for (uint8_t num_bits = 1; num_bits < 16; num_bits++) {
 		uint16_t next_bits = deflate_rev_word(next_word, num_bits);
 		if (bl_count[num_bits] && next_bits >= next_code[num_bits]
@@ -203,9 +234,11 @@ static uint16_t deflate_huff(uint8_t * lengths, uint16_t size,
 				deflate_input_now++;
 				deflate_bit_offset -= 8;
 			}
+#ifdef DEFLATE_WITH_LUT
+			return codes[code + (next_bits - next_code[num_bits])];
+#else
 			uint8_t len_pos = next_bits;
 			uint8_t cur_pos = next_code[num_bits];
-			// This is slow, but memory-efficient
 			for (uint16_t i = 0; i < size; i++) {
 				if (lengths[i] == num_bits) {
 					if (cur_pos == len_pos) {
@@ -214,20 +247,35 @@ static uint16_t deflate_huff(uint8_t * lengths, uint16_t size,
 					cur_pos++;
 				}
 			}
+#endif
+		} else {
+#ifdef DEFLATE_WITH_LUT
+			code += bl_count[num_bits];
+#endif
 		}
 	}
 	return 65535;
 }
 
+#ifdef DEFLATE_WITH_LUT
+static int8_t deflate_huffman(uint16_t * ll_codes, uint16_t * d_codes)
+#else
 static int8_t deflate_huffman(uint8_t * ll_lengths, uint16_t ll_size,
 			      uint8_t * d_lengths, uint8_t d_size)
+#endif
 {
 	uint16_t code;
 	uint16_t dcode;
 	while (1) {
+#ifdef DEFLATE_WITH_LUT
+		code =
+		    deflate_huff(ll_codes, deflate_bl_count_ll,
+				 deflate_next_code_ll);
+#else
 		code =
 		    deflate_huff(ll_lengths, ll_size, deflate_bl_count_ll,
 				 deflate_next_code_ll);
+#endif
 		if (code < 256) {
 			if (deflate_output_now == deflate_output_end) {
 				return DEFLATE_ERR_OUTPUT_LENGTH;
@@ -244,10 +292,17 @@ static int8_t deflate_huffman(uint8_t * ll_lengths, uint16_t ll_size,
 			if (extra_bits) {
 				len_val += deflate_get_bits(extra_bits);
 			}
+#ifdef DEFLATE_WITH_LUT
+			dcode =
+			    deflate_huff(d_codes,
+					 deflate_bl_count_d,
+					 deflate_next_code_d);
+#else
 			dcode =
 			    deflate_huff(d_lengths, d_size,
 					 deflate_bl_count_d,
 					 deflate_next_code_d);
+#endif
 			uint16_t dist_val = deflate_distance_offsets[dcode];
 			extra_bits = deflate_distance_bits[dcode];
 			if (extra_bits) {
@@ -313,12 +368,21 @@ static int8_t deflate_static_huffman()
 		deflate_lld_lengths[i] = 5;
 	}
 
+#ifdef DEFLATE_WITH_LUT
+	deflate_build_alphabet(deflate_lld_lengths, 288, deflate_bl_count_ll,
+			       deflate_next_code_ll, deflate_ll_codes);
+	deflate_build_alphabet(deflate_lld_lengths + 288, 29,
+			       deflate_bl_count_d, deflate_next_code_d,
+			       deflate_d_codes);
+	return deflate_huffman(deflate_ll_codes, deflate_d_codes);
+#else
 	deflate_build_alphabet(deflate_lld_lengths, 288, deflate_bl_count_ll,
 			       deflate_next_code_ll);
 	deflate_build_alphabet(deflate_lld_lengths + 288, 29,
 			       deflate_bl_count_d, deflate_next_code_d);
 	return deflate_huffman(deflate_lld_lengths, 288,
 			       deflate_lld_lengths + 288, 29);
+#endif
 }
 
 static int8_t deflate_dynamic_huffman()
@@ -336,16 +400,29 @@ static int8_t deflate_dynamic_huffman()
 		deflate_hc_lengths[deflate_hclen_index[i]] = 0;
 	}
 
+#ifdef DEFLATE_WITH_LUT
+	deflate_build_alphabet(deflate_hc_lengths,
+			       sizeof(deflate_hc_lengths),
+			       deflate_bl_count_ll, deflate_next_code_ll,
+			       deflate_ll_codes);
+#else
 	deflate_build_alphabet(deflate_hc_lengths,
 			       sizeof(deflate_hc_lengths),
 			       deflate_bl_count_ll, deflate_next_code_ll);
+#endif
 
 	uint16_t items_processed = 0;
 	while (items_processed < hlit + hdist) {
+#ifdef DEFLATE_WITH_LUT
+		uint8_t code = deflate_huff(deflate_ll_codes,
+					    deflate_bl_count_ll,
+					    deflate_next_code_ll);
+#else
 		uint8_t code =
 		    deflate_huff(deflate_hc_lengths, sizeof(deflate_hc_lengths),
 				 deflate_bl_count_ll,
 				 deflate_next_code_ll);
+#endif
 		if (code == 16) {
 			uint8_t copy_count = 3 + deflate_get_bits(2);
 			for (uint8_t i = 0; i < copy_count; i++) {
@@ -371,13 +448,22 @@ static int8_t deflate_dynamic_huffman()
 		}
 	}
 
+#ifdef DEFLATE_WITH_LUT
+	deflate_build_alphabet(deflate_lld_lengths, hlit,
+			       deflate_bl_count_ll, deflate_next_code_ll,
+			       deflate_ll_codes);
+	deflate_build_alphabet(deflate_lld_lengths + hlit, hdist,
+			       deflate_bl_count_d, deflate_next_code_d,
+			       deflate_d_codes);
+	return deflate_huffman(deflate_ll_codes, deflate_d_codes);
+#else
 	deflate_build_alphabet(deflate_lld_lengths, hlit,
 			       deflate_bl_count_ll, deflate_next_code_ll);
 	deflate_build_alphabet(deflate_lld_lengths + hlit, hdist,
 			       deflate_bl_count_d, deflate_next_code_d);
-
 	return deflate_huffman(deflate_lld_lengths, hlit,
 			       deflate_lld_lengths + hlit, hdist);
+#endif
 }
 
 int16_t inflate(unsigned char *input_buf, uint16_t input_len,
diff --git a/test/compile-c++11.sh b/test/compile-c++11.sh
index cb13625..b3305a5 100755
--- a/test/compile-c++11.sh
+++ b/test/compile-c++11.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 
-exec g++ -std=c++11 -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c
+exec g++ -std=c++11 -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c
diff --git a/test/compile-c++20.sh b/test/compile-c++20.sh
index 502063b..aa9f02a 100755
--- a/test/compile-c++20.sh
+++ b/test/compile-c++20.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 
 # g++ as provided by Debian Buster (used for CI tests) does not support c++20
-exec g++ -std=c++2a -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c
+exec g++ -std=c++2a -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c
diff --git a/test/compile-c11.sh b/test/compile-c11.sh
index 87ec632..c9ba1c4 100755
--- a/test/compile-c11.sh
+++ b/test/compile-c11.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 
-exec gcc -std=c11 -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c
+exec gcc -std=c11 -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c
diff --git a/test/compile-c99.sh b/test/compile-c99.sh
index 4583ebf..c13617d 100755
--- a/test/compile-c99.sh
+++ b/test/compile-c99.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 
-exec gcc -std=c99 -Wall -Wextra -pedantic -I../src -o inflate inflate-app.c ../src/inflate.c
+exec gcc -std=c99 -O2 -Wall -Wextra -pedantic -I../src "$@" -o inflate inflate-app.c ../src/inflate.c
diff --git a/test/test.sh b/test/test.sh
index 44104d5..60e3e26 100755
--- a/test/test.sh
+++ b/test/test.sh
@@ -4,10 +4,7 @@ set -eu
 
 cd "$(dirname "$0")"
 
-for std in c++11 c++20 c99 c11; do
-
-	"./compile-${std}.sh"
-
+run_tests() {
 	for file in $(find .. -type f -size -32760c); do
 		if ! ./deflate $file | ./inflate > tmp; then
 			echo "inflate error at $file"
@@ -15,6 +12,21 @@ for std in c++11 c++20 c99 c11; do
 		fi
 		diff $file tmp
 	done
+}
+
+for std in c++11 c++20 c99 c11; do
+
+	"./compile-${std}.sh"
+	run_tests
+
+	"./compile-${std}.sh" -DDEFLATE_CHECKSUM
+	run_tests
+
+	"./compile-${std}.sh" -DDEFLATE_WITH_LUT
+	run_tests
+
+	"./compile-${std}.sh" -DDEFLATE_CHECKSUM -DDEFLATE_WITH_LUT
+	run_tests
 
 done
 
-- 
cgit v1.2.3