1 ! Copyright (C) 2006, 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors byte-arrays math math.order kernel sequences
4 sbufs vectors growable io continuations namespaces io.encodings
14 : starts-2? ( char -- ? )
15 dup [ -6 shift BIN: 10 number= ] when ; inline
17 : append-nums ( stream byte -- stream char )
18 over stream-read1 dup starts-2?
19 [ swap 6 shift swap BIN: 111111 bitand bitor ]
20 [ 2drop replacement-char ] if ; inline
22 : minimum-code-point ( char minimum -- char )
23 over > [ drop replacement-char ] when ;
25 : double ( stream byte -- stream char )
26 BIN: 11111 bitand append-nums
27 HEX: 80 minimum-code-point ; inline
29 : triple ( stream byte -- stream char )
30 BIN: 1111 bitand append-nums append-nums
31 HEX: 800 minimum-code-point ; inline
33 : quadruple ( stream byte -- stream char )
34 BIN: 111 bitand append-nums append-nums append-nums
35 HEX: 10000 minimum-code-point ; inline
37 : begin-utf8 ( stream byte -- stream char )
39 { [ dup -7 shift zero? ] [ ] }
40 { [ dup -5 shift BIN: 110 = ] [ double ] }
41 { [ dup -4 shift BIN: 1110 = ] [ triple ] }
42 { [ dup -3 shift BIN: 11110 = ] [ quadruple ] }
43 [ drop replacement-char ]
46 : decode-utf8 ( stream -- char/f )
47 dup stream-read1 dup [ begin-utf8 ] when nip ; inline
50 drop decode-utf8 ; inline
54 : encoded ( stream char -- )
55 BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ; inline
57 : char>utf8 ( char stream -- )
59 { [ dup -7 shift zero? ] [ swap stream-write1 ] }
60 { [ dup -11 shift zero? ] [
61 2dup -6 shift BIN: 11000000 bitor swap stream-write1
64 { [ dup -16 shift zero? ] [
65 2dup -12 shift BIN: 11100000 bitor swap stream-write1
70 2dup -18 shift BIN: 11110000 bitor swap stream-write1
71 2dup -12 shift encoded
83 [ [ char>utf8 ] curry each ]
84 [ [ >byte-array ] dip stream-write ] if ;
88 : code-point-length ( n -- x )
91 { [ dup 0 6 between? ] [ 1 ] }
92 { [ dup 7 10 between? ] [ 2 ] }
93 { [ dup 11 15 between? ] [ 3 ] }
94 { [ dup 16 20 between? ] [ 4 ] }
98 : code-point-offsets ( string -- indices )
99 0 [ code-point-length + ] accumulate swap suffix ;
101 : utf8-index> ( n string -- n' )
102 code-point-offsets [ <= ] with find drop ;
104 : >utf8-index ( n string -- n' )
105 code-point-offsets nth ;