1 ! (c)2010 Joe Groff bsd license
\r
2 USING: accessors byte-arrays byte-arrays.hex combinators
\r
3 continuations fry io io.encodings io.encodings.8-bit.latin1
\r
4 io.encodings.ascii io.encodings.binary io.encodings.iana
\r
5 io.encodings.string io.encodings.utf16 io.encodings.utf32
\r
6 io.encodings.utf8 io.files io.streams.string kernel literals
\r
7 math namespaces sequences strings ;
\r
8 IN: io.encodings.detect
\r
10 SYMBOL: default-8bit-encoding
\r
11 default-8bit-encoding [ latin1 ] initialize
\r
15 : prolog-tag ( bytes -- string )
\r
16 CHAR: > over index [ 1 + head-slice ] when* >string ;
\r
18 : prolog-encoding ( string -- iana-encoding )
\r
20 _ "encoding=" over start
\r
21 10 + swap [ [ 1 - ] dip nth ] [ index-from ] [ swapd subseq ] 2tri
\r
22 ] [ drop "UTF-8" ] recover ;
\r
24 : detect-xml-prolog ( bytes -- encoding )
\r
25 prolog-tag prolog-encoding name>encoding [ ascii ] unless* ;
\r
27 : valid-utf8? ( bytes -- ? )
\r
28 utf8 decode 1 head-slice* replacement-char swap member? not ;
\r
32 : detect-byte-array ( bytes -- encoding )
\r
34 { [ dup HEX{ 0000FEFF } head? ] [ drop utf32be ] }
\r
35 { [ dup HEX{ FFFE0000 } head? ] [ drop utf32le ] }
\r
36 { [ dup HEX{ FEFF } head? ] [ drop utf16be ] }
\r
37 { [ dup HEX{ FFFE } head? ] [ drop utf16le ] }
\r
38 { [ dup HEX{ EF BB BF } head? ] [ drop utf8 ] }
\r
39 { [ dup $[ "<?xml" >byte-array ] head? ] [ detect-xml-prolog ] }
\r
40 { [ 0 over member? ] [ drop binary ] }
\r
41 { [ dup empty? ] [ drop utf8 ] }
\r
42 { [ dup valid-utf8? ] [ drop utf8 ] }
\r
43 [ drop default-8bit-encoding get ]
\r
46 : detect-stream ( stream -- sample encoding )
\r
47 256 swap stream-read dup detect-byte-array ;
\r
49 : detect-file ( file -- encoding )
\r
50 binary [ input-stream get detect-stream nip ] with-file-reader ;
\r