]> gitweb.factorcode.org Git - factor.git/blob - core/io/encodings/utf8/utf8.factor
factor: Rename GENERIC# to GENERIC#:.
[factor.git] / core / io / encodings / utf8 / utf8.factor
1 ! Copyright (C) 2006, 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors combinators io io.encodings
4 io.encodings.private kernel math math.order sequences strings ;
5 IN: io.encodings.utf8
6
7 ! Decoding UTF-8
8
9 SINGLETON: utf8
10
11 <PRIVATE
12
13 : starts-2? ( char -- ? )
14     dup [ -6 shift 0b10 number= ] when ; inline
15
16 : append-nums ( stream byte -- stream char )
17     over stream-read1 dup starts-2?
18     [ [ 6 shift ] dip 0b111111 bitand bitor ]
19     [ 2drop replacement-char ] if ; inline
20
21 : minimum-code-point ( char minimum -- char )
22     over > [ drop replacement-char ] when ; inline
23
24 : maximum-code-point ( char maximum -- char )
25     over < [ drop replacement-char ] when ; inline
26
27 : double ( stream byte -- stream char )
28     0b11111 bitand append-nums
29     0x80 minimum-code-point ; inline
30
31 : triple ( stream byte -- stream char )
32     0b1111 bitand append-nums append-nums
33     0x800 minimum-code-point ; inline
34
35 : quadruple ( stream byte -- stream char )
36     0b111 bitand append-nums append-nums append-nums
37     0x10000 minimum-code-point
38     0x10FFFF maximum-code-point ; inline
39
40 : begin-utf8 ( stream byte -- stream char )
41     dup 127 > [
42         {
43             { [ dup -5 shift 0b110 = ] [ double ] }
44             { [ dup -4 shift 0b1110 = ] [ triple ] }
45             { [ dup -3 shift 0b11110 = ] [ quadruple ] }
46             [ drop replacement-char ]
47         } cond
48     ] when ; inline
49
50 : decode-utf8 ( stream -- char/f )
51     dup stream-read1 dup [ begin-utf8 ] when nip ; inline
52
53 M: utf8 decode-char
54     drop decode-utf8 ; inline
55
56 M: utf8 decode-until (decode-until) ;
57
58 ! Encoding UTF-8
59
60 : encoded ( stream char -- )
61     0b111111 bitand 0b10000000 bitor swap stream-write1 ; inline
62
63 : char>utf8 ( char stream -- )
64     over 127 <= [ stream-write1 ] [
65         swap {
66             { [ dup -11 shift zero? ] [
67                 2dup -6 shift 0b11000000 bitor swap stream-write1
68                 encoded
69             ] }
70             { [ dup -16 shift zero? ] [
71                 2dup -12 shift 0b11100000 bitor swap stream-write1
72                 2dup -6 shift encoded
73                 encoded
74             ] }
75             [
76                 2dup -18 shift 0b11110000 bitor swap stream-write1
77                 2dup -12 shift encoded
78                 2dup -6 shift encoded
79                 encoded
80             ]
81         } cond
82     ] if ; inline
83
84 M: utf8 encode-char
85     drop char>utf8 ;
86
87 GENERIC#: encode-string-utf8 1 ( string stream -- )
88
89 M: object encode-string-utf8
90     [ char>utf8 ] curry each ; inline
91
92 M: string encode-string-utf8
93     over aux>>
94     [ call-next-method ]
95     [ [ string>byte-array-fast ] dip stream-write ] if ; inline
96
97 M: utf8 encode-string drop encode-string-utf8 ;
98
99 PRIVATE>
100
101 : code-point-length ( n -- x )
102     [ 1 ] [
103         log2 {
104             { [ dup 0 6 between? ] [ 1 ] }
105             { [ dup 7 10 between? ] [ 2 ] }
106             { [ dup 11 15 between? ] [ 3 ] }
107             { [ dup 16 20 between? ] [ 4 ] }
108         } cond nip
109     ] if-zero ;
110
111 : code-point-offsets ( string -- indices )
112     0 [ code-point-length + ] accumulate swap suffix ;
113
114 : utf8-index> ( n string -- n' )
115     code-point-offsets [ <= ] with find drop ;
116
117 : >utf8-index ( n string -- n' )
118     code-point-offsets nth ;