1 ! Copyright (C) 2005, 2009 Daniel Ehrenberg
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: kernel namespaces xml.name io.encodings.utf8 xml.elements
4 io.encodings.utf16 xml.tokenize xml.state math ascii sequences
5 io.encodings.string io.encodings combinators accessors
6 xml.data io.encodings.iana ;
9 : decode-stream ( encoding -- )
10 spot get [ swap re-decode ] change-stream drop ;
12 : continue-make-tag ( str -- tag )
13 parse-name-starting middle-tag end-tag ;
15 : start-utf16le ( -- tag )
20 : 10xxxxxx? ( ch -- ? )
21 -6 shift 3 bitand 2 = ;
23 : start<name ( ch -- tag )
24 ! This is unfortunate, and exists for the corner case
25 ! that the first letter of the document is < and second is
28 [ utf8 decode-stream next make-tag ] [
30 [ get-next 10xxxxxx? not ] take-until
31 get-char suffix utf8 decode
32 utf8 decode-stream next
36 : prolog-encoding ( prolog -- )
37 encoding>> dup "UTF-16" =
38 [ drop ] [ name>encoding [ decode-stream ] when* ] if ;
40 : instruct-encoding ( instruct/prolog -- )
43 [ drop utf8 decode-stream ] if ;
46 check utf8 decode-stream next next ;
49 ! What if first letter of processing instruction is non-ASCII?
51 { 0 [ next next start-utf16le ] }
52 { CHAR: ? [ go-utf8 instruct dup instruct-encoding ] }
53 { CHAR: ! [ go-utf8 direct ] }
57 : skip-utf8-bom ( -- tag )
58 "\u0000bb\u0000bf" expect utf8 decode-stream
59 "<" expect check make-tag ;
61 : decode-expecting ( encoding string -- tag )
62 [ decode-stream next ] [ expect ] bi* check make-tag ;
64 : start-utf16be ( -- tag )
65 utf16be "<" decode-expecting ;
67 : skip-utf16le-bom ( -- tag )
68 utf16le "\u0000fe<" decode-expecting ;
70 : skip-utf16be-bom ( -- tag )
71 utf16be "\u0000ff<" decode-expecting ;
73 : start-document ( -- tag )
75 { CHAR: < [ start< ] }
76 { 0 [ start-utf16be ] }
77 { HEX: EF [ skip-utf8-bom ] }
78 { HEX: FF [ skip-utf16le-bom ] }
79 { HEX: FE [ skip-utf16be-bom ] }
80 [ drop utf8 decode-stream check f ]