1 ! Copyright (C) 2020 Jacob Fischer, Abtin Molavi.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs bit-arrays byte-arrays
4 combinators compression.huffman fry kernel literals locals make
5 math math.bits math.order ranges namespaces sequences
6 sequences.deep splitting vectors ;
17 :: longest-prefix ( ind seq -- start end )
18 ind dup ind + seq length min [a..b]
20 ind swap seq <slice> _ subseq-start
23 :: create-pair ( ind seq -- array )
24 ind seq longest-prefix :> ( start end )
28 [ n ind start - 2array ]
31 : sum-vec ( vec -- n )
32 [ dup array? [ first ] [ drop 1 ] if ] map-sum ;
34 :: compress-lz77 ( seq -- vec )
35 0 seq create-pair seq length <vector> ?push [ dup sum-vec seq length < ] [ dup sum-vec seq create-pair swap ?push ] while ;
37 : gzip-header ( -- header )
38 { 31 139 8 0 0 0 255 } >byte-array ;
42 ! Fixed Huffman table encoding specified in section 3.2.5 of RFC 1951
43 : length-to-code ( length -- code )
45 { [ dup 11 < ] [ 254 + ] }
46 { [ dup 19 < ] [ [ 11 - 2 /i 265 + ] [ 11 - 2 mod 1 <bits> >bit-array ] bi 2array ] }
47 { [ dup 35 < ] [ [ 19 - 4 /i 269 + ] [ 19 - 4 mod 2 <bits> >bit-array ] bi 2array ] }
48 { [ dup 67 < ] [ [ 35 - 8 /i 273 + ] [ 35 - 8 mod 3 <bits> >bit-array ] bi 2array ] }
49 { [ dup 131 < ] [ [ 67 - 16 /i 277 + ] [ 67 - 16 mod 4 <bits> >bit-array ] bi 2array ] }
50 { [ dup 258 < ] [ [ 131 - 32 /i 281 + ] [ 131 - 32 mod 5 <bits> >bit-array ] bi 2array ] }
55 : dist-to-code ( dist -- code )
57 { [ dup 5 < ] [ -1 + ] }
58 { [ dup 9 < ] [ [ 5 - 2 /i 4 + ] [ 5 - 2 mod 1 <bits> >bit-array ] bi 2array ] }
59 { [ dup 17 < ] [ [ 9 - 4 /i 6 + ] [ 9 - 4 mod 2 <bits> >bit-array ] bi 2array ] }
60 { [ dup 33 < ] [ [ 17 - 8 /i 8 + ] [ 17 - 8 mod 3 <bits> >bit-array ] bi 2array ] }
61 { [ dup 65 < ] [ [ 33 - 16 /i 10 + ] [ 33 - 16 mod 4 <bits> >bit-array ] bi 2array ] }
62 { [ dup 129 < ] [ [ 65 - 32 /i 12 + ] [ 65 - 32 mod 5 <bits> >bit-array ] bi 2array ] }
63 { [ dup 257 < ] [ [ 129 - 64 /i 14 + ] [ 129 - 64 mod 6 <bits> >bit-array ] bi 2array ] }
64 { [ dup 513 < ] [ [ 257 - 128 /i 16 + ] [ 257 - 128 mod 7 <bits> >bit-array ] bi 2array ] }
65 { [ dup 1025 < ] [ [ 513 - 256 /i 18 + ] [ 513 - 256 mod 8 <bits> >bit-array ] bi 2array ] }
66 { [ dup 2049 < ] [ [ 1025 - 512 /i 20 + ] [ 1025 - 512 mod 9 <bits> >bit-array ] bi 2array ] }
67 { [ dup 4097 < ] [ [ 2049 - 1024 /i 22 + ] [ 2049 - 1024 mod 10 <bits> >bit-array ] bi 2array ] }
68 { [ dup 8193 < ] [ [ 4097 - 2048 /i 24 + ] [ 4097 - 2048 mod 11 <bits> >bit-array ] bi 2array ] }
69 { [ dup 16385 < ] [ [ 8193 - 4096 /i 26 + ] [ 8193 - 4096 mod 12 <bits> >bit-array ] bi 2array ] }
70 [ [ 8193 - 4096 /i 28 + ] [ 8193 - 4096 mod 13 <bits> >bit-array ] bi 2array ]
74 ! Words for transforming our vector of (length, distance) pairs and bytes into literals using above table
75 : pair-to-code ( pr -- code )
76 [ first length-to-code ] [ second dist-to-code ] bi 2array ;
78 : vec-to-lits ( vec -- vec )
79 [ dup array? [ pair-to-code ] [ ] if ] map ;
81 ! Words for using the fixed Huffman code to map literals to bit arrays
82 ! This is the table in section 3.2.6
83 : (lit-to-bits) ( lit -- bitarr )
85 { [ dup 144 < ] [ 48 + 8 <bits> >bit-array reverse ] }
86 { [ dup 256 < ] [ 144 - 400 + 9 <bits> >bit-array reverse ] }
87 { [ dup 280 < ] [ 256 - 7 <bits> >bit-array reverse ] }
88 [ 280 - 192 + 8 <bits> >bit-array reverse ]
92 ! Gluing codes with their extra bits
94 : dist-to-bits ( dist -- bits )
95 dup array? [ [ first 5 <bits> >bit-array reverse ] [ second ] bi 2array ] [ 5 <bits> >bit-array reverse ] if ;
97 : lit-to-bits ( lit -- bits )
98 dup array? [ [ first (lit-to-bits) ] [ second ] bi 2array ] [ (lit-to-bits) ] if ;
100 : pair-to-bits ( l,d -- bits )
101 [ first lit-to-bits ] [ second dist-to-bits ] bi 2array ;
103 : vec-to-bits ( vec -- bitarr )
104 [ dup array? [ pair-to-bits ] [ (lit-to-bits) ] if ] map ;
107 ! fixed huffman compression function
108 : (compress-fixed) ( bytes -- bits )
109 compress-lz77 vec-to-lits vec-to-bits ;
113 ! using distance code 31 to represent no distance code for particular elements because it cannot occur
114 : dists ( vec -- seq )
115 [ dup array? [ second dup array? [ first ] when ] [ drop 31 ] if ] map 31 swap remove ;
117 : len-lits ( vec -- seq )
118 [ dup array? [ first ] when dup array? [ first ] when ] map ;
120 ! Given an lz77 compressed block, constructs the huffman code tables
121 : build-dicts ( vec -- lit-dict dist-dict )
122 [ len-lits generate-canonical-codes ]
123 [ dists generate-canonical-codes ] bi ;
126 ! Use the given dictionary to replace the element with its code
127 :: replace-one ( ele code-dict -- new-ele )
128 ele array? [ ele first code-dict at ele second 2array ] [ ele code-dict at ] if ;
130 ! replace both elements of a length distance pair with their codes
131 : replace-pair ( pair -- new-pair )
132 [ first lit-dict get replace-one ] [ second dist-dict get replace-one ] bi 2array ;
134 ! Replace all vector elements with their codes
135 : vec-to-codes ( vec -- new-vec )
136 [ dup array? [ replace-pair ] [ lit-dict get replace-one ] if ] map ;
138 ! Dictionary encoding
139 : lit-code-lens ( -- len-seq )
140 285 [0..b] [ lit-dict get at length ] map [ zero? ] trim-tail ;
142 : dist-code-lens ( -- len-seq )
143 31 [0..b] [ dist-dict get at length ] map [ zero? ] trim-tail ;
145 :: replace-0-single ( m len-seq -- new-len-seq )
146 m 11 < [ len-seq m 0 <array> 17 m 3 - 3 <bits> >bit-array 2array 1array replace ]
147 [ len-seq m 0 <array> 18 m 11 - 7 <bits> >bit-array 2array 1array replace ]
150 :: replace-0-range ( range len-seq -- new-len-seq )
151 range empty? [ len-seq ] [ range first range 1 tail len-seq replace-0-range replace-0-single ] if ;
153 : replace-0 ( len-seq -- new-len-seq )
154 2 139 (a..b) swap replace-0-range ;
156 :: replace-runs ( n len-seq -- new-len-seq )
157 len-seq 7 n <array> { n { 16 ?{ t t } } } replace
158 6 n <array> { n { 16 ?{ f t } } } replace
159 5 n <array> { n { 16 ?{ t f } } } replace
160 4 n <array> { n { 16 ?{ f f } } } replace ;
162 :: replace-all-runs ( range len-seq -- new-len-seq )
163 range empty? [ len-seq ] [ range first range 1 tail len-seq replace-all-runs replace-runs ] if ;
165 : run-free-lit ( -- len-seq )
166 0 285 [a..b] lit-code-lens replace-0 replace-all-runs ;
168 : run-free-dist ( -- len-seq )
169 0 31 [a..b] dist-code-lens replace-0 replace-all-runs ;
171 : run-free-codes ( -- len-seq )
172 run-free-lit run-free-dist append ;
174 : code-len-dict ( -- code-dict )
175 run-free-codes [ dup array? [ first ] when ] map generate-canonical-codes ;
177 : compressed-lens ( -- len-seq )
178 run-free-codes [ dup array? [ [ first code-len-dict at ] [ second ] bi 2array ] [ code-len-dict at ] if ] map ;
180 CONSTANT: clen-shuffle { 16 17 18 0 8 7 9 6 10 5 11 4 12 3 13 2 14 1 15 }
182 : clen-seq ( -- len-seq )
183 clen-shuffle [ code-len-dict at length ] map [ zero? ] trim-tail ;
185 : clen-bits ( -- bit-arr )
186 clen-seq [ 3 <bits> >bit-array ] map ;
188 : h-lit ( -- bit-arr )
189 lit-code-lens length 257 - 5 <bits> >bit-array ;
191 : h-dist ( -- bit-arr )
192 dist-code-lens length 1 - 5 <bits> >bit-array ;
194 : h-clen ( -- bit-arr )
195 clen-seq length 4 - 4 <bits> >bit-array ;
197 : dynamic-headers ( -- bit-arr-seq )
198 ?{ f t } h-lit h-dist h-clen 4array concat ;
201 { headers bit-array }
203 { compressed-lens array }
204 { compressed-data vector } ;
206 ! Compresses a block with dynamic huffman compression, outputting a nested array structure
207 : (compress-dynamic) ( lit-seq -- bit-arr-seq )
208 [ dup compress-lz77 vec-to-lits { 256 } append lit-vec set
209 lit-vec get build-dicts
212 lit-code-lens supremum 16 < clen-seq supremum 8 < and
213 [ drop dynamic-headers clen-bits compressed-lens
214 lit-vec get vec-to-codes deflate-block boa ]
215 [ halves [ (compress-dynamic) ] bi@ 2array ] if
219 : flatten-single ( ele -- bits )
220 dup array? [ concat ] when ;
222 : flatten-lens ( compressed-lens -- bits )
223 [ flatten-single ] map concat ;
225 : flatten-pair ( pair -- bits )
226 dup array? [ [ first flatten-single ] [ second flatten-single ] bi append ] when ;
228 : flatten-block ( bit-arr-seq -- byte-array )
229 { [ headers>> ] [ clen>> concat ] [ compressed-lens>> flatten-lens ] [ compressed-data>> [ flatten-pair ] map concat ] } cleave 4array concat ;
231 : flatten-blocks ( blocks -- byte-array )
232 [ flatten-block ] map unclip-last [ [ ?{ f } prepend ] map ] dip ?{ t } prepend suffix concat ;
236 : compress-dynamic ( byte-array -- byte-array )
237 (compress-dynamic) [ deflate-block? ] deep-filter flatten-blocks underlying>> gzip-header prepend B{ 0 0 } append ;
239 : compress-fixed ( byte-array -- byte-array )
240 (compress-fixed) [ flatten-pair ] map concat ?{ t t f } prepend underlying>> gzip-header prepend B{ 0 0 } append ;