1 ! Copyright (C) 2006, 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: math math.order kernel sequences sbufs vectors growable io
4 continuations namespaces io.encodings combinators strings ;
13 : starts-2? ( char -- ? )
14 dup [ -6 shift BIN: 10 number= ] when ; inline
16 : append-nums ( stream byte -- stream char )
17 over stream-read1 dup starts-2?
18 [ swap 6 shift swap BIN: 111111 bitand bitor ]
19 [ 2drop replacement-char ] if ; inline
21 : double ( stream byte -- stream char )
22 BIN: 11111 bitand append-nums ; inline
24 : triple ( stream byte -- stream char )
25 BIN: 1111 bitand append-nums append-nums ; inline
27 : quadruple ( stream byte -- stream char )
28 BIN: 111 bitand append-nums append-nums append-nums ; inline
30 : begin-utf8 ( stream byte -- stream char )
32 { [ dup -7 shift zero? ] [ ] }
33 { [ dup -5 shift BIN: 110 = ] [ double ] }
34 { [ dup -4 shift BIN: 1110 = ] [ triple ] }
35 { [ dup -3 shift BIN: 11110 = ] [ quadruple ] }
36 [ drop replacement-char ]
39 : decode-utf8 ( stream -- char/f )
40 dup stream-read1 dup [ begin-utf8 ] when nip ; inline
47 : encoded ( stream char -- )
48 BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ;
50 : char>utf8 ( stream char -- )
52 { [ dup -7 shift zero? ] [ swap stream-write1 ] }
53 { [ dup -11 shift zero? ] [
54 2dup -6 shift BIN: 11000000 bitor swap stream-write1
57 { [ dup -16 shift zero? ] [
58 2dup -12 shift BIN: 11100000 bitor swap stream-write1
63 2dup -18 shift BIN: 11110000 bitor swap stream-write1
64 2dup -12 shift encoded
75 : code-point-length ( n -- x )
77 { [ dup 0 7 between? ] [ 1 ] }
78 { [ dup 8 11 between? ] [ 2 ] }
79 { [ dup 12 16 between? ] [ 3 ] }
80 { [ dup 17 21 between? ] [ 4 ] }
83 : code-point-offsets ( string -- indices )
84 0 [ code-point-length + ] accumulate swap suffix ;
86 : utf8-index> ( n string -- n' )
87 code-point-offsets [ <= ] with find drop ;
89 : >utf8-index ( n string -- n' )
90 code-point-offsets nth ;