]> gitweb.factorcode.org Git - factor.git/blob - extra/io/encodings/detect/detect.factor
Update some copyright headers to follow the current convention
[factor.git] / extra / io / encodings / detect / detect.factor
1 ! Copyright (C) 2010 Joe Groff.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors byte-arrays combinators continuations fry io
4 io.encodings io.encodings.8-bit.latin1 io.encodings.ascii
5 io.encodings.binary io.encodings.iana io.encodings.string
6 io.encodings.utf16 io.encodings.utf32 io.encodings.utf8
7 io.files io.streams.string kernel literals math namespaces
8 sequences strings ;
9 IN: io.encodings.detect
10
11 SYMBOL: default-8bit-encoding
12 default-8bit-encoding [ latin1 ] initialize
13
14 <PRIVATE
15
16 : prolog-tag ( bytes -- string )
17     CHAR: > over index [ 1 + head-slice ] when* >string ;
18
19 : prolog-encoding ( string -- iana-encoding )
20     '[
21         _ "encoding=" over subseq-start
22         10 + swap [ [ 1 - ] dip nth ] [ index-from ] [ swapd subseq ] 2tri
23     ] [ drop "UTF-8" ] recover ;
24
25 : detect-xml-prolog ( bytes -- encoding )
26     prolog-tag prolog-encoding name>encoding [ ascii ] unless* ;
27
28 : valid-utf8? ( bytes -- ? )
29     utf8 decode but-last-slice replacement-char swap member? not ;
30
31 PRIVATE>
32
33 : detect-byte-array ( bytes -- encoding )
34     {
35         { [ dup B{ 0x00 0x00 0xFE 0xFF } head? ] [ drop utf32be ] }
36         { [ dup B{ 0xFF 0xFE 0x00 0x00 } head? ] [ drop utf32le ] }
37         { [ dup B{ 0xFE 0xFF } head? ] [ drop utf16be ] }
38         { [ dup B{ 0xFF 0xFE } head? ] [ drop utf16le ] }
39         { [ dup B{ 0xEF 0xBB 0xBF } head? ] [ drop utf8 ] }
40         { [ dup $[ "<?xml" >byte-array ] head? ] [ detect-xml-prolog ] }
41         { [ 0 over member? ] [ drop binary ] }
42         { [ dup empty? ] [ drop utf8 ] }
43         { [ dup valid-utf8? ] [ drop utf8 ] }
44         [ drop default-8bit-encoding get ]
45     } cond ;
46
47 : detect-stream ( stream -- sample encoding )
48     256 swap stream-read dup detect-byte-array ;
49
50 : detect-file ( file -- encoding )
51     binary [ input-stream get detect-stream nip ] with-file-reader ;