1 ! Copyright (C) 2020 Jacob Fischer, Abtin Molavi.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs bit-arrays byte-arrays
4 combinators compression.huffman kernel math math.bits math.order
5 namespaces ranges sequences sequences.deep splitting vectors ;
16 :: longest-prefix ( ind seq -- start end )
17 ind dup ind + seq length min [a..b]
19 ind swap seq <slice> _ subseq-start
22 :: create-pair ( ind seq -- array )
23 ind seq longest-prefix :> ( start end )
27 [ n ind start - 2array ]
30 : sum-vec ( vec -- n )
31 [ dup array? [ first ] [ drop 1 ] if ] map-sum ;
33 :: compress-lz77 ( seq -- vec )
34 0 seq create-pair seq length <vector> ?push [ dup sum-vec seq length < ] [ dup sum-vec seq create-pair swap ?push ] while ;
36 : gzip-header ( -- header )
37 { 31 139 8 0 0 0 255 } >byte-array ;
41 ! Fixed Huffman table encoding specified in section 3.2.5 of RFC 1951
42 : length-to-code ( length -- code )
44 { [ dup 11 < ] [ 254 + ] }
45 { [ dup 19 < ] [ [ 11 - 2 /i 265 + ] [ 11 - 2 mod 1 <bits> >bit-array ] bi 2array ] }
46 { [ dup 35 < ] [ [ 19 - 4 /i 269 + ] [ 19 - 4 mod 2 <bits> >bit-array ] bi 2array ] }
47 { [ dup 67 < ] [ [ 35 - 8 /i 273 + ] [ 35 - 8 mod 3 <bits> >bit-array ] bi 2array ] }
48 { [ dup 131 < ] [ [ 67 - 16 /i 277 + ] [ 67 - 16 mod 4 <bits> >bit-array ] bi 2array ] }
49 { [ dup 258 < ] [ [ 131 - 32 /i 281 + ] [ 131 - 32 mod 5 <bits> >bit-array ] bi 2array ] }
54 : dist-to-code ( dist -- code )
56 { [ dup 5 < ] [ -1 + ] }
57 { [ dup 9 < ] [ [ 5 - 2 /i 4 + ] [ 5 - 2 mod 1 <bits> >bit-array ] bi 2array ] }
58 { [ dup 17 < ] [ [ 9 - 4 /i 6 + ] [ 9 - 4 mod 2 <bits> >bit-array ] bi 2array ] }
59 { [ dup 33 < ] [ [ 17 - 8 /i 8 + ] [ 17 - 8 mod 3 <bits> >bit-array ] bi 2array ] }
60 { [ dup 65 < ] [ [ 33 - 16 /i 10 + ] [ 33 - 16 mod 4 <bits> >bit-array ] bi 2array ] }
61 { [ dup 129 < ] [ [ 65 - 32 /i 12 + ] [ 65 - 32 mod 5 <bits> >bit-array ] bi 2array ] }
62 { [ dup 257 < ] [ [ 129 - 64 /i 14 + ] [ 129 - 64 mod 6 <bits> >bit-array ] bi 2array ] }
63 { [ dup 513 < ] [ [ 257 - 128 /i 16 + ] [ 257 - 128 mod 7 <bits> >bit-array ] bi 2array ] }
64 { [ dup 1025 < ] [ [ 513 - 256 /i 18 + ] [ 513 - 256 mod 8 <bits> >bit-array ] bi 2array ] }
65 { [ dup 2049 < ] [ [ 1025 - 512 /i 20 + ] [ 1025 - 512 mod 9 <bits> >bit-array ] bi 2array ] }
66 { [ dup 4097 < ] [ [ 2049 - 1024 /i 22 + ] [ 2049 - 1024 mod 10 <bits> >bit-array ] bi 2array ] }
67 { [ dup 8193 < ] [ [ 4097 - 2048 /i 24 + ] [ 4097 - 2048 mod 11 <bits> >bit-array ] bi 2array ] }
68 { [ dup 16385 < ] [ [ 8193 - 4096 /i 26 + ] [ 8193 - 4096 mod 12 <bits> >bit-array ] bi 2array ] }
69 [ [ 8193 - 4096 /i 28 + ] [ 8193 - 4096 mod 13 <bits> >bit-array ] bi 2array ]
73 ! Words for transforming our vector of (length, distance) pairs and bytes into literals using above table
74 : pair-to-code ( pr -- code )
75 [ first length-to-code ] [ second dist-to-code ] bi 2array ;
77 : vec-to-lits ( vec -- vec )
78 [ dup array? [ pair-to-code ] [ ] if ] map ;
80 ! Words for using the fixed Huffman code to map literals to bit arrays
81 ! This is the table in section 3.2.6
82 : (lit-to-bits) ( lit -- bitarr )
84 { [ dup 144 < ] [ 48 + 8 <bits> >bit-array reverse ] }
85 { [ dup 256 < ] [ 144 - 400 + 9 <bits> >bit-array reverse ] }
86 { [ dup 280 < ] [ 256 - 7 <bits> >bit-array reverse ] }
87 [ 280 - 192 + 8 <bits> >bit-array reverse ]
91 ! Gluing codes with their extra bits
93 : dist-to-bits ( dist -- bits )
94 dup array? [ [ first 5 <bits> >bit-array reverse ] [ second ] bi 2array ] [ 5 <bits> >bit-array reverse ] if ;
96 : lit-to-bits ( lit -- bits )
97 dup array? [ [ first (lit-to-bits) ] [ second ] bi 2array ] [ (lit-to-bits) ] if ;
99 : pair-to-bits ( l,d -- bits )
100 [ first lit-to-bits ] [ second dist-to-bits ] bi 2array ;
102 : vec-to-bits ( vec -- bitarr )
103 [ dup array? [ pair-to-bits ] [ (lit-to-bits) ] if ] map ;
106 ! fixed huffman compression function
107 : (compress-fixed) ( bytes -- bits )
108 compress-lz77 vec-to-lits vec-to-bits ;
112 ! using distance code 31 to represent no distance code for particular elements because it cannot occur
113 : dists ( vec -- seq )
114 [ dup array? [ second dup array? [ first ] when ] [ drop 31 ] if ] map 31 swap remove ;
116 : len-lits ( vec -- seq )
117 [ dup array? [ first ] when dup array? [ first ] when ] map ;
119 ! Given an lz77 compressed block, constructs the huffman code tables
120 : build-dicts ( vec -- lit-dict dist-dict )
121 [ len-lits generate-canonical-codes ]
122 [ dists generate-canonical-codes ] bi ;
125 ! Use the given dictionary to replace the element with its code
126 :: replace-one ( ele code-dict -- new-ele )
127 ele array? [ ele first code-dict at ele second 2array ] [ ele code-dict at ] if ;
129 ! replace both elements of a length distance pair with their codes
130 : replace-pair ( pair -- new-pair )
131 [ first lit-dict get replace-one ] [ second dist-dict get replace-one ] bi 2array ;
133 ! Replace all vector elements with their codes
134 : vec-to-codes ( vec -- new-vec )
135 [ dup array? [ replace-pair ] [ lit-dict get replace-one ] if ] map ;
137 ! Dictionary encoding
138 : lit-code-lens ( -- len-seq )
139 285 [0..b] [ lit-dict get at length ] map [ zero? ] trim-tail ;
141 : dist-code-lens ( -- len-seq )
142 31 [0..b] [ dist-dict get at length ] map [ zero? ] trim-tail ;
144 :: replace-0-single ( m len-seq -- new-len-seq )
145 m 11 < [ len-seq m 0 <array> 17 m 3 - 3 <bits> >bit-array 2array 1array replace ]
146 [ len-seq m 0 <array> 18 m 11 - 7 <bits> >bit-array 2array 1array replace ]
149 :: replace-0-range ( range len-seq -- new-len-seq )
150 range empty? [ len-seq ] [ range first range 1 tail len-seq replace-0-range replace-0-single ] if ;
152 : replace-0 ( len-seq -- new-len-seq )
153 2 139 (a..b) swap replace-0-range ;
155 :: replace-runs ( n len-seq -- new-len-seq )
156 len-seq 7 n <array> { n { 16 ?{ t t } } } replace
157 6 n <array> { n { 16 ?{ f t } } } replace
158 5 n <array> { n { 16 ?{ t f } } } replace
159 4 n <array> { n { 16 ?{ f f } } } replace ;
161 :: replace-all-runs ( range len-seq -- new-len-seq )
162 range empty? [ len-seq ] [ range first range 1 tail len-seq replace-all-runs replace-runs ] if ;
164 : run-free-lit ( -- len-seq )
165 0 285 [a..b] lit-code-lens replace-0 replace-all-runs ;
167 : run-free-dist ( -- len-seq )
168 0 31 [a..b] dist-code-lens replace-0 replace-all-runs ;
170 : run-free-codes ( -- len-seq )
171 run-free-lit run-free-dist append ;
173 : code-len-dict ( -- code-dict )
174 run-free-codes [ dup array? [ first ] when ] map generate-canonical-codes ;
176 : compressed-lens ( -- len-seq )
177 run-free-codes [ dup array? [ [ first code-len-dict at ] [ second ] bi 2array ] [ code-len-dict at ] if ] map ;
179 CONSTANT: clen-shuffle { 16 17 18 0 8 7 9 6 10 5 11 4 12 3 13 2 14 1 15 }
181 : clen-seq ( -- len-seq )
182 clen-shuffle [ code-len-dict at length ] map [ zero? ] trim-tail ;
184 : clen-bits ( -- bit-arr )
185 clen-seq [ 3 <bits> >bit-array ] map ;
187 : h-lit ( -- bit-arr )
188 lit-code-lens length 257 - 5 <bits> >bit-array ;
190 : h-dist ( -- bit-arr )
191 dist-code-lens length 1 - 5 <bits> >bit-array ;
193 : h-clen ( -- bit-arr )
194 clen-seq length 4 - 4 <bits> >bit-array ;
196 : dynamic-headers ( -- bit-arr-seq )
197 ?{ f t } h-lit h-dist h-clen 4array concat ;
200 { headers bit-array }
202 { compressed-lens array }
203 { compressed-data vector } ;
205 ! Compresses a block with dynamic huffman compression, outputting a nested array structure
206 : (compress-dynamic) ( lit-seq -- bit-arr-seq )
207 [ dup compress-lz77 vec-to-lits { 256 } append lit-vec set
208 lit-vec get build-dicts
211 lit-code-lens supremum 16 < clen-seq supremum 8 < and
212 [ drop dynamic-headers clen-bits compressed-lens
213 lit-vec get vec-to-codes deflate-block boa ]
214 [ halves [ (compress-dynamic) ] bi@ 2array ] if
218 : flatten-single ( ele -- bits )
219 dup array? [ concat ] when ;
221 : flatten-lens ( compressed-lens -- bits )
222 [ flatten-single ] map concat ;
224 : flatten-pair ( pair -- bits )
225 dup array? [ [ first flatten-single ] [ second flatten-single ] bi append ] when ;
227 : flatten-block ( bit-arr-seq -- byte-array )
228 { [ headers>> ] [ clen>> concat ] [ compressed-lens>> flatten-lens ] [ compressed-data>> [ flatten-pair ] map concat ] } cleave 4array concat ;
230 : flatten-blocks ( blocks -- byte-array )
231 [ flatten-block ] map unclip-last [ [ ?{ f } prepend ] map ] dip ?{ t } prepend suffix concat ;
235 : compress-dynamic ( byte-array -- byte-array )
236 (compress-dynamic) [ deflate-block? ] deep-filter flatten-blocks underlying>> gzip-header prepend B{ 0 0 } append ;
238 : compress-fixed ( byte-array -- byte-array )
239 (compress-fixed) [ flatten-pair ] map concat ?{ t t f } prepend underlying>> gzip-header prepend B{ 0 0 } append ;