! Copyright (C) 2006, 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math math.order kernel sequences sbufs vectors growable io
continuations namespaces io.encodings combinators strings ;
IN: io.encodings.utf8

! Decoding UTF-8

! The encoding descriptor; methods below dispatch on this singleton.
SINGLETON: utf8

<PRIVATE

! NOTE(review): the text between "SINGLETON: utf8" and the body of
! char>utf8 was missing from the source this file was restored from
! (no decoder words and no decode-char method were visible). The
! decoder must be restored from version control; it is not
! reconstructed here.

! Encoding UTF-8

! Writes one UTF-8 continuation byte (10xxxxxx) carrying the low six
! bits of char to stream.
! NOTE(review): this definition was not visible in the source; it is
! reconstructed from its call sites in char>utf8, which pass an
! unmasked, pre-shifted code point with stack effect
! ( stream char -- ). Confirm against the original.
: encoded ( stream char -- )
    BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ;

! Writes the UTF-8 byte sequence for the code point char to stream.
! The number of bytes is chosen by the magnitude of char:
!   char < 2^7   -> 1 byte
!   char < 2^11  -> 2 bytes
!   char < 2^16  -> 3 bytes
!   otherwise    -> 4 bytes
: char>utf8 ( stream char -- )
    {
        ! Fits in seven bits: the byte is written unchanged.
        { [ dup -7 shift zero? ] [ swap stream-write1 ] }
        ! Lead byte 110xxxxx followed by one continuation byte.
        { [ dup -11 shift zero? ] [
            2dup -6 shift BIN: 11000000 bitor swap stream-write1
            encoded
        ] }
        ! Lead byte 1110xxxx followed by two continuation bytes.
        { [ dup -16 shift zero? ] [
            2dup -12 shift BIN: 11100000 bitor swap stream-write1
            2dup -6 shift encoded
            encoded
        ] }
        ! Lead byte 11110xxx followed by three continuation bytes.
        [
            2dup -18 shift BIN: 11110000 bitor swap stream-write1
            2dup -12 shift encoded
            2dup -6 shift encoded
            encoded
        ]
    } cond ;

! encode-char receives ( char stream encoding -- ); the encoding
! object is dropped and the remaining two items are swapped into the
! ( stream char -- ) order that char>utf8 expects.
M: utf8 encode-char
    drop swap char>utf8 ;

PRIVATE>

! Number of bytes char>utf8 emits for the code point n.
! log2 yields the index of the highest set bit, so the ranges below
! are the bit-index equivalents of the thresholds used in char>utf8
! (2^7, 2^11, 2^16). Zero is handled separately because log2 is not
! defined for it. Values with a highest bit index above 20 (beyond
! 0x10FFFF) match no clause and make cond raise an error.
: code-point-length ( n -- x )
    dup zero? [ drop 1 ] [
        log2 {
            { [ dup 0 6 between? ] [ 1 ] }
            { [ dup 7 10 between? ] [ 2 ] }
            { [ dup 11 15 between? ] [ 3 ] }
            { [ dup 16 20 between? ] [ 4 ] }
        } cond nip
    ] if ;

! Running totals of code-point-length over string: element i is the
! sum of the lengths of all code points before index i, and the final
! element is the total for the whole string.
: code-point-offsets ( string -- indices )
    0 [ code-point-length + ] accumulate swap suffix ;

! Index of the first entry in the offsets of string that is greater
! than or equal to n.
: utf8-index> ( n string -- n' )
    code-point-offsets [ <= ] with find drop ;

! The nth entry in the offsets of string, i.e. the summed encoded
! length of the first n code points.
: >utf8-index ( n string -- n' )
    code-point-offsets nth ;