1 ! Copyright (C) 2006, 2009 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors alien.accessors byte-arrays io io.binary
4 io.encodings kernel math math.private sequences
5 sequences.private strings strings.private ;
! Fold the next byte of a code unit into the bits gathered so far.
!   byte - the byte just read from the stream, or f if the stream ended
!   ch   - the bits accumulated so far
! Leaves (ch << 8) | byte, or replacement-char when byte is f.
: append-nums ( byte ch -- ch )
    swap [ [ 8 shift ] dip bitor ] [ drop replacement-char ] if* ;
! Read the second (low) byte of a big-endian 16-bit unit and join it with
! the already-read high byte. The stream stays on the stack underneath.
! If the stream is exhausted, append-nums substitutes replacement-char.
: double-be ( stream byte -- stream char )
    [ dup stream-read1 ] dip append-nums ;
26 : quad-be ( stream byte -- stream char )
27 double-be over stream-read1 [
28 dup -2 shift 0b110111 number= [
29 [ 2 shift ] dip 0b11 bitand bitor
30 over stream-read1 swap append-nums 0x10000 +
31 ] [ 2drop dup stream-read1 drop replacement-char ] if
! Consume and discard one byte from the stream, then leave
! replacement-char as the decoded result.
: ignore ( stream -- stream char )
    [ stream-read1 drop ] keep replacement-char ;
37 : begin-utf16be ( stream byte -- stream char )
38 dup -3 shift 0b11011 number= [
39 dup 0b00000100 bitand zero?
40 [ 0b11 bitand quad-be ]
! Decode one character from a big-endian UTF-16 stream.
! The encoding singleton is only used for dispatch and is dropped.
! Leaves f when the stream has no more bytes; otherwise hands the first
! byte to begin-utf16be and leaves the character it produces.
M: utf16be decode-char
    drop dup stream-read1
    [ begin-utf16be nip ] [ drop f ] if* ;
! Finish decoding a little-endian surrogate pair.
!   stream - the byte stream being decoded
!   ch     - the 10 payload bits taken from the high surrogate
! Reads the two bytes of the second code unit. If the second of them is a
! low-surrogate lead byte (top six bits 110111), the 20 payload bits are
! combined and 0x10000 is added to give the code point. Otherwise
! replacement-char is left.
! Either read may hit end of stream (stream-read1 leaves f). The f is
! checked before any arithmetic so that a truncated pair decodes to
! replacement-char instead of raising an error on f.
: quad-le ( stream ch -- stream char )
    over stream-read1 [
        ! ( stream ch byte3 ): byte3 holds the low 8 payload bits
        swap 10 shift bitor
        over stream-read1 [
            ! ( stream acc byte4 ): byte4 must be 110111xx
            dup -2 shift 0b110111 = [
                0b11 bitand append-nums 0x10000 +
            ] [ 2drop replacement-char ] if
        ] [ drop replacement-char ] if*
    ] [ drop replacement-char ] if* ;
! Decode one little-endian 16-bit unit whose two bytes are already read.
!   byte1 - low byte, byte2 - high byte
! If byte2 is not a surrogate lead byte (top five bits 11011), the result
! is simply (byte2 << 8) | byte1. If it is a high surrogate (bit 2 clear),
! its 10 payload bits are assembled and quad-le reads the second unit.
! A low surrogate in leading position decodes to replacement-char.
: double-le ( stream byte1 byte2 -- stream char )
    dup -3 shift 0b11011 = not [ append-nums ] [
        dup 0b100 bitand zero?
        [ 0b11 bitand 8 shift bitor quad-le ]
        [ 2drop replacement-char ] if
    ] if ;
! Start decoding a little-endian character from its first (low) byte.
! Reads the second byte; if the stream ends there, the dangling byte is
! dropped and replacement-char is left. Otherwise double-le takes over.
: begin-utf16le ( stream byte -- stream char )
    over stream-read1 dup
    [ double-le ] [ 2drop replacement-char ] if ;
! Decode one character from a little-endian UTF-16 stream.
! The encoding singleton is only used for dispatch and is dropped.
! Leaves f when the stream has no more bytes; otherwise hands the first
! byte to begin-utf16le and leaves the character it produces.
M: utf16le decode-char
    drop dup stream-read1
    [ begin-utf16le nip ] [ drop f ] if* ;
68 ! UTF-16LE/BE encoding
! Produce the two bytes of the high (leading) surrogate, most significant
! byte first.
!   char - NOTE(review): assumed to be the 20-bit value (code point minus
!          0x10000); the subtraction is not visible here - confirm caller.
! The top 10 bits are selected with -10 shift. Without that shift byte1
! would not be a 110110xx lead byte and byte2 would duplicate the low byte
! emitted by encode-second.
!   byte1 = 0xD8 | (top two of those 10 bits)
!   byte2 = low eight of those 10 bits
: encode-first ( char -- byte1 byte2 )
    -10 shift
    [ -8 shift 0b11011000 bitor ] [ 0xFF bitand ] bi ; inline
! Produce the two bytes of the low (trailing) surrogate, most significant
! byte first.
!   char - NOTE(review): assumed to be the 20-bit value (code point minus
!          0x10000); the subtraction is not visible here - confirm caller.
! Only the low 10 bits belong in this surrogate, so they are masked off
! first. Without the mask the bits above bit 9 leak into byte3 and corrupt
! the 110111xx marker, or overflow a byte.
!   byte3 = 0xDC | (top two of those 10 bits)
!   byte4 = low eight of those 10 bits
: encode-second ( char -- byte3 byte4 )
    0b1111111111 bitand
    [ -8 shift 0b11011100 bitor ] [ 0b11111111 bitand ] bi ; inline
! Write two byte values to stream with a single stream-write call by
! packing them into a fresh two-element byte array (char1 first).
: stream-write2 ( char1 char2 stream -- )
    -rot 2byte-array swap stream-write ; inline
! Alternative kept for reference (two separate single-byte writes):
! [ stream-write1 ] curry bi@ ; inline
82 : char>utf16be ( char stream -- )
85 [ [ encode-first ] dip stream-write2 ]
86 [ [ encode-second ] dip stream-write2 ] 2bi
87 ] [ [ h>b/b swap ] dip stream-write2 ] if ; inline
89 M: utf16be encode-char ( char stream encoding -- )
92 : char>utf16le ( char stream -- )
95 [ [ encode-first swap ] dip stream-write2 ]
96 [ [ encode-second swap ] dip stream-write2 ] 2bi
97 ] [ [ h>b/b ] dip stream-write2 ] if ; inline
99 M: utf16le encode-char ( char stream encoding -- )
! Store the n-th character of string into byte-array at index 2*n + off.
!   off        - added to the doubled index (callers in this file pass 0 or 1)
!   n          - index of the character within string
!   byte-array - destination buffer
!   string     - source string
! Uses the unchecked fixnum and unsafe accessor words throughout.
! NOTE(review): presumably every character fits in one byte and the buffer
! holds at least 2 * length bytes; neither is verified here - confirm callers.
102 : ascii-char>utf16-byte-array ( off n byte-array string -- )
! ( off n byte-array ch ) -> ( off ch n byte-array )
103 overd string-nth-fast -rot
! under the byte-array: ( off ch n ) -> ( ch 2n+off )
104 [ 2 fixnum*fast rot fixnum+fast ] dip
105 set-nth-unsafe ; inline
! Build a byte array twice as long as string and place each character of
! string at index 2*n + off, leaving the remaining bytes as allocated by
! <byte-array>.
!   off    - position of the character byte inside each 2-byte cell
!   string - source string
! NOTE(review): presumably only called with strings whose characters all
! fit in a single byte - confirm at the call sites.
107 : ascii-string>utf16-byte-array ( off string -- byte-array )
! ( off string ) -> ( off indices byte-array string )
108 [ length >fixnum [ <iota> ] [ 2 fixnum*fast <byte-array> ] bi ] keep
! run the per-character store for every index; keepd leaves the byte-array
109 [ [ ascii-char>utf16-byte-array ] 2curry with each ] keepd ; inline
! Write string to stream as little-endian 16-bit units: offset 0 puts each
! character in the first byte of its 2-byte cell.
: ascii-string>utf16le ( string stream -- )
    0 -rot [ ascii-string>utf16-byte-array ] dip stream-write ; inline
! Write string to stream as big-endian 16-bit units: offset 1 puts each
! character in the second byte of its 2-byte cell.
: ascii-string>utf16be ( string stream -- )
    1 -rot [ ascii-string>utf16-byte-array ] dip stream-write ; inline
! Write a whole sequence of characters to stream as UTF-16LE.
! Dispatches on the string argument (one below the top of the stack).
116 GENERIC#: encode-string-utf16le 1 ( string stream -- )
! Fallback for any sequence: encode element by element with char>utf16le,
! with the stream curried onto the quotation.
118 M: object encode-string-utf16le
119 [ char>utf16le ] curry each ; inline
121 M: string encode-string-utf16le
124 [ ascii-string>utf16le ] if ; inline
! encode-string for utf16le: the encoding object is only needed for
! dispatch, so it is dropped and the work is done by encode-string-utf16le.
126 M: utf16le encode-string drop encode-string-utf16le ;
! Write a whole sequence of characters to stream as UTF-16BE.
! Dispatches on the string argument (one below the top of the stack).
128 GENERIC#: encode-string-utf16be 1 ( string stream -- )
! Fallback for any sequence: encode element by element with char>utf16be,
! with the stream curried onto the quotation.
130 M: object encode-string-utf16be
131 [ char>utf16be ] curry each ; inline
133 M: string encode-string-utf16be
136 [ ascii-string>utf16be ] if ; inline
! encode-string for utf16be: the encoding object is only needed for
! dispatch, so it is dropped and the work is done by encode-string-utf16be.
138 M: utf16be encode-string drop encode-string-utf16be ;
! Length estimates for utf16le, based on two bytes per character.
! Encoded length: twice the input count.
140 M: utf16le guess-encoded-length drop 2 * ; inline
! Decoded length: half the input count, using integer division.
141 M: utf16le guess-decoded-length drop 2 /i ; inline
! Length estimates for utf16be, based on two bytes per character.
! Encoded length: twice the input count.
143 M: utf16be guess-encoded-length drop 2 * ; inline
! Decoded length: half the input count, using integer division.
144 M: utf16be guess-decoded-length drop 2 /i ; inline
! Byte order mark as it appears at the start of a little-endian stream.
148 CONSTANT: bom-le B{ 0xff 0xfe }
! Byte order mark as it appears at the start of a big-endian stream.
150 CONSTANT: bom-be B{ 0xfe 0xff }
152 : bom>le/be ( bom -- le/be )
153 dup bom-le sequence= [ drop utf16le ] [
154 bom-be sequence= [ utf16be ] [ missing-bom ] if
! Build a decoder for BOM-prefixed UTF-16: read the first two bytes of the
! stream, let bom>le/be choose the concrete encoding from them, and wrap
! the stream in a decoder for that encoding.
M: utf16 <decoder> ( stream utf16 -- decoder )
    drop dup 2 swap stream-read bom>le/be <decoder> ;
! Build an encoder for BOM-prefixed UTF-16: write the little-endian byte
! order mark to the stream first, then wrap the stream in a utf16le encoder.
M: utf16 <encoder> ( stream utf16 -- encoder )
    drop [ bom-le swap stream-write ] keep utf16le <encoder> ;
! True when the host stores integers little-endian: a byte array whose
! first byte is 1 reads back as the integer 1 only in that byte order.
: le? ( -- ? )
    B{ 1 0 } 0 alien-unsigned-2 1 = ; foldable
! The UTF-16 encoding that matches the host's native byte order:
! utf16le when le? is true, utf16be otherwise.
: utf16n ( -- value )
    le? [ utf16le ] [ utf16be ] if ;