1 ! Copyright (C) 2006, 2007 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: math kernel sequences sbufs vectors namespaces io.binary
4 io.encodings combinators splitting ;
! Pushes 0 as a placeholder character together with the `ignore`
! state. handle-quad2be falls back to this when its bit-pattern
! check on the incoming byte fails.
13 : do-ignore ( -- ch state ) 0 ignore ;
15 : append-nums ( byte ch -- ch )
18 : end-multibyte ( buf byte ch -- buf ch state )
21 : begin-utf16be ( buf byte -- buf ch state )
22 dup -3 shift BIN: 11011 number= [
23 dup BIN: 00000100 bitand zero?
24 [ BIN: 11 bitand quad1 ]
! Handles the byte read while decode-utf16be-step is in the quad2
! state. If the top six bits of byte are 110111, ch is shifted left
! by two, the low two bits of byte are or-ed in, and the next state
! is quad3. Any other byte discards both values and continues via
! do-ignore.
28 : handle-quad2be ( byte ch -- ch state )
29 over -2 shift BIN: 110111 number= [
30 2 shift swap BIN: 11 bitand bitor quad3
31 ] [ 2drop do-ignore ] if ;
33 : decode-utf16be-step ( buf byte ch state -- buf ch state )
35 { begin [ drop begin-utf16be ] }
36 { double [ end-multibyte ] }
37 { quad1 [ append-nums quad2 ] }
38 { quad2 [ handle-quad2be ] }
39 { quad3 [ append-nums HEX: 10000 + decoded ] }
40 { ignore [ 2drop push-replacement ] }
! Decodes seq into str by handing decode-utf16be-step to `decode`
! as the step quotation.
! NOTE(review): `decode` is not defined in this file (presumably it
! comes from io.encodings and drives the buf/ch/state loop) - confirm.
43 : decode-utf16be ( seq -- str )
44 [ decode-utf16be-step ] decode ;
! Handles the byte read while decode-utf16le-step is in the `double`
! state; ch is the byte kept from the `begin` state.
! If the top five bits of byte are 11011 (byte is in D8..DF):
!   - bit 2 clear: the low two bits of byte, shifted left by 8, are
!     or-ed into ch and the next state is quad2;
!   - bit 2 set: both values are dropped and push-replacement runs.
! Otherwise the two values are handed to end-multibyte with byte on
! top of ch.
46 : handle-double ( buf byte ch -- buf ch state )
47 over -3 shift BIN: 11011 = [
48 over BIN: 100 bitand zero?
49 [ swap BIN: 11 bitand 8 shift bitor quad2 ]
50 [ 2drop push-replacement ] if
51 ] [ swap end-multibyte ] if ;
! Handles the byte read while decode-utf16le-step is in the quad3
! state. If the top six bits of byte are 110111, the low two bits of
! byte are combined with ch through append-nums, HEX: 10000 is added
! and the result is followed by `decoded`. Any other byte drops both
! values and runs push-replacement.
53 : handle-quad3le ( buf byte ch -- buf ch state )
54 over -2 shift BIN: 110111 = [
55 swap BIN: 11 bitand append-nums HEX: 10000 + decoded
56 ] [ 2drop push-replacement ] if ;
58 : decode-utf16le-step ( buf byte ch state -- buf ch state )
60 { begin [ drop double ] }
61 { double [ handle-double ] }
62 { quad1 [ append-nums quad2 ] }
63 { quad2 [ 10 shift bitor quad3 ] }
64 { quad3 [ handle-quad3le ] }
! Decodes seq into str by handing decode-utf16le-step to `decode`
! as the step quotation.
! NOTE(review): `decode` is not defined in this file (presumably it
! comes from io.encodings and drives the buf/ch/state loop) - confirm.
67 : decode-utf16le ( seq -- str )
68 [ decode-utf16le-step ] decode ;
72 dup -8 shift BIN: 11011000 bitor
76 BIN: 1111111111 bitand
77 dup -8 shift BIN: 11011100 bitor
78 swap BIN: 11111111 bitand ;
80 : char>utf16be ( char -- )
83 dup encode-first swap , ,
84 encode-second swap , ,
! Applies char>utf16be to every element of str inside `make`, using
! B{ } as the exemplar for the sequence that is returned.
87 : encode-utf16be ( str -- seq )
88 [ [ char>utf16be ] each ] B{ } make ;
90 : char>utf16le ( char -- )
95 ] [ h>b/b swap , , ] if ;
! Applies char>utf16le to every element of str inside `make`, using
! B{ } as the exemplar for the sequence that is returned.
97 : encode-utf16le ( str -- seq )
98 [ [ char>utf16le ] each ] B{ } make ;
! The two-byte sequence FF FE. encode-utf16 puts it in front of its
! output, and decode-utf16 strips it before calling decode-utf16le.
100 : bom-le B{ HEX: ff HEX: fe } ; inline
! The two-byte sequence FE FF. decode-utf16 strips it before calling
! decode-utf16be.
102 : bom-be B{ HEX: fe HEX: ff } ; inline
! Encodes str with encode-utf16le and returns the result with
! bom-le placed in front of it.
104 : encode-utf16 ( str -- seq )
105 bom-le swap encode-utf16le append ;
107 : decode-utf16 ( seq -- str )
109 { [ bom-le ?head ] [ decode-utf16le ] }
110 { [ bom-be ?head ] [ decode-utf16be ] }
111 { [ t ] [ decode-error ] }
! Constructor: passes the utf16le class to construct-delegate.
! NOTE(review): presumably this wraps the stream on the stack as the
! delegate of a new utf16le tuple - confirm against construct-delegate.
115 : <utf16le> utf16le construct-delegate ;
! Registers utf16le as an instance of encoding-stream.
116 INSTANCE: utf16le encoding-stream
! Both methods discard the utf16le object itself and forward to the
! little-endian words defined earlier in this file.
118 M: utf16le encode-string drop encode-utf16le ;
119 M: utf16le decode-step drop decode-utf16le-step ;
! Constructor: passes the utf16be class to construct-delegate.
! NOTE(review): presumably this wraps the stream on the stack as the
! delegate of a new utf16be tuple - confirm against construct-delegate.
122 : <utf16be> utf16be construct-delegate ;
! Registers utf16be as an instance of encoding-stream.
123 INSTANCE: utf16be encoding-stream
! Both methods discard the utf16be object itself and forward to the
! big-endian words defined earlier in this file.
125 M: utf16be encode-string drop encode-utf16be ;
126 M: utf16be decode-step drop decode-utf16be-step ;