1 ! Copyright (C) 2006, 2009 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: math kernel sequences sbufs vectors namespaces io.binary
4 io.encodings combinators splitting io byte-arrays ;
! Shifts ch left by 8 bits and ORs byte into the result.
! When byte is f (callers such as double-be pass a stream-read1 result
! here), both inputs are discarded and replacement-char is returned.
: append-nums ( byte ch -- ch )
    swap [ [ 8 shift ] dip bitor ] [ drop replacement-char ] if* ;
! Reads one more byte from stream and joins it with the byte already in
! hand via append-nums: the given byte becomes the high part, the newly
! read byte the low part. The stream stays on the stack beneath the result.
: double-be ( stream byte -- stream char )
    [ dup stream-read1 ] dip append-nums ;
25 : quad-be ( stream byte -- stream char )
26 double-be over stream-read1 [
27 dup -2 shift BIN: 110111 number= [
28 [ 2 shift ] dip BIN: 11 bitand bitor
29 over stream-read1 swap append-nums HEX: 10000 +
30 ] [ 2drop dup stream-read1 drop replacement-char ] if
! Consumes one element from stream, throws it away, and leaves
! replacement-char on the stack above the stream.
: ignore ( stream -- stream char )
    [ stream-read1 drop ] keep replacement-char ;
36 : begin-utf16be ( stream byte -- stream char )
37 dup -3 shift BIN: 11011 number= [
38 dup BIN: 00000100 bitand zero?
39 [ BIN: 11 bitand quad-be ]
! Decodes one character from a UTF-16BE stream.
! The encoding object itself is not needed and is dropped. If the first
! stream-read1 yields f, the result is f; otherwise the byte is handed to
! begin-utf16be and its result is returned.
M: utf16be decode-char
    drop dup stream-read1
    [ begin-utf16be ] [ f ] if* nip ;
! Finishes decoding a little-endian surrogate pair.
! ch is the value double-le built from the first code unit:
! ( byte2 AND 3 ) shifted left 8, OR byte1. This word reads the two bytes
! of the second code unit and assembles the final code point.
!
! Returns replacement-char when:
!   - the stream ends before either of the two remaining bytes, or
!   - the fourth byte does not have the 110111xx bit pattern.
!
! Both stream-read1 results are checked for f before any arithmetic is
! done on them, so a stream truncated in the middle of a surrogate pair
! produces replacement-char instead of an error from bitor/shift on f.
: quad-le ( stream ch -- stream char )
    over stream-read1 [
        ! stream ch byte3 -- stream acc ; acc = byte3 OR ( ch << 10 )
        swap 10 shift bitor
        over stream-read1 dup [
            ! stream acc byte4
            dup -2 shift BIN: 110111 = [
                ! Low two bits of byte4 fill bits 8-9 of the result.
                BIN: 11 bitand append-nums HEX: 10000 +
            ] [ 2drop replacement-char ] if
        ] [
            ! EOF on the fourth byte: drop acc and the f.
            2drop replacement-char
        ] if
    ] [
        ! EOF on the third byte: drop ch.
        drop replacement-char
    ] if* ;
! Decodes a little-endian 16-bit unit whose two bytes have already been
! read (byte1 is the low byte, byte2 the high byte).
!   - byte2 without the 11011xxx pattern: the two bytes are simply joined
!     with append-nums.
!   - byte2 = 110110xx: the low two bits of byte2 and byte1 are combined
!     and passed to quad-le, which reads the rest of the pair.
!   - byte2 = 110111xx: both bytes are dropped and replacement-char is
!     returned.
: double-le ( stream byte1 byte2 -- stream char )
    dup -3 shift BIN: 11011 = not [
        append-nums
    ] [
        dup BIN: 100 bitand zero? [
            BIN: 11 bitand 8 shift bitor quad-le
        ] [
            2drop replacement-char
        ] if
    ] if ;
! Given the first byte of a little-endian unit, reads the second byte.
! If the read yields f, the first byte is dropped and replacement-char is
! returned; otherwise both bytes go to double-le.
: begin-utf16le ( stream byte -- stream char )
    over stream-read1
    [ double-le ] [ drop replacement-char ] if* ;
! Decodes one character from a UTF-16LE stream.
! The encoding object itself is not needed and is dropped. If the first
! stream-read1 yields f, the result is f; otherwise the byte is handed to
! begin-utf16le and its result is returned.
M: utf16le decode-char
    drop dup stream-read1
    [ begin-utf16le ] [ f ] if* nip ;
67 ! UTF-16LE/BE encoding
69 : encode-first ( char -- byte1 byte2 )
71 [ -8 shift BIN: 11011000 bitor ] [ HEX: FF bitand ] bi ;
! Produces the two bytes of the second code unit of a surrogate pair.
! Only the low 10 bits of char are used:
!   byte3 = bits 8-9 OR'd with HEX: DC (the UTF-16 trail-surrogate prefix)
!   byte4 = bits 0-7
: encode-second ( char -- byte3 byte4 )
    HEX: 3FF bitand
    [ -8 shift HEX: DC bitor ] keep
    HEX: FF bitand ;
! Writes char1 and then char2 to stream, one stream-write1 call each.
: stream-write2 ( char1 char2 stream -- )
    ! char1 char2 stream -- char1 stream char2 stream
    swap over
    [ stream-write1 ] 2bi@ ;
80 : char>utf16be ( char stream -- )
83 [ [ encode-first ] dip stream-write2 ]
84 [ [ encode-second ] dip stream-write2 ] 2bi
85 ] [ [ h>b/b swap ] dip stream-write2 ] if ;
87 M: utf16be encode-char ( char stream encoding -- )
90 : char>utf16le ( stream char -- )
93 [ [ encode-first swap ] dip stream-write2 ]
94 [ [ encode-second swap ] dip stream-write2 ] 2bi
95 ] [ [ h>b/b ] dip stream-write2 ] if ;
97 M: utf16le encode-char ( char stream encoding -- )
! Byte-order marks: the two-byte prefixes that bom>le/be compares against
! to choose between utf16le and utf16be.

! FF FE selects little-endian.
CONSTANT: bom-le B{ HEX: FF HEX: FE }

! FE FF selects big-endian.
CONSTANT: bom-be B{ HEX: FE HEX: FF }
106 : bom>le/be ( bom -- le/be )
107 dup bom-le sequence= [ drop utf16le ] [
108 bom-be sequence= [ utf16be ] [ missing-bom ] if
! Builds a decoder for a stream tagged with the generic utf16 encoding.
! The first two elements of the stream are read and given to bom>le/be,
! and the encoding it returns is used to construct the actual decoder.
M: utf16 <decoder> ( stream utf16 -- decoder )
    drop
    dup 2 swap stream-read   ! stream -- stream bom
    bom>le/be <decoder> ;
! Builds an encoder for a stream tagged with the generic utf16 encoding.
! bom-le is written to the stream first, then a utf16le encoder is
! created over the same stream.
M: utf16 <encoder> ( stream utf16 -- encoder )
    drop
    [ bom-le swap stream-write ] keep
    utf16le <encoder> ;