--- /dev/null
+Daniel Ehrenberg
--- /dev/null
+UTF16 encoding/decoding
--- /dev/null
+USING: help.markup help.syntax io.encodings strings ;
+IN: io.encodings.utf16
+
+ARTICLE: "io.encodings.utf16" "UTF-16"
+"The UTF-16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences. There are three encoding descriptor classes for working with UTF-16, depending on endianness or the presence of a BOM:"
+{ $subsection utf16 }
+{ $subsection utf16le }
+{ $subsection utf16be }
+{ $subsection utf16n } ;
+
+ABOUT: "io.encodings.utf16"
+
+HELP: utf16le
+{ $class-description "The encoding descriptor for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
+{ $see-also "encodings-introduction" } ;
+
+HELP: utf16be
+{ $class-description "The encoding descriptor for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
+{ $see-also "encodings-introduction" } ;
+
+HELP: utf16
+{ $class-description "The encoding descriptor for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." }
+{ $see-also "encodings-introduction" } ;
+
+HELP: utf16n
+{ $class-description "The encoding descriptor for UTF-16 without a byte order mark in native endian order. This is useful mostly for FFI calls which take input of strings of the type wchar_t*" }
+{ $see-also "encodings-introduction" } ;
+
+{ utf16 utf16le utf16be utf16n } related-words
--- /dev/null
+USING: kernel tools.test io.encodings.utf16 arrays sbufs
+io.streams.byte-array sequences io.encodings io unicode
+io.encodings.string alien.c-types accessors classes ;
+IN: io.encodings.utf16.tests
+
+[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode >array ] unit-test
+[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode >array ] unit-test
+
+[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode >array ] unit-test
+
+[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode >array ] unit-test
+[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
+[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
+
+[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test
+
+[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode >array ] unit-test
+[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode >array ] unit-test
+
+[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode >array ] unit-test
+
+: correct-endian
+ code>> class little-endian? [ utf16le = ] [ utf16be = ] if ;
+
+[ t ] [ B{ } utf16n <byte-reader> correct-endian ] unit-test
+[ t ] [ utf16n <byte-writer> correct-endian ] unit-test
--- /dev/null
+! Copyright (C) 2006, 2008 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: math kernel sequences sbufs vectors namespaces io.binary
+io.encodings combinators splitting io byte-arrays inspector
+alien.c-types ;
+IN: io.encodings.utf16
+
+TUPLE: utf16be ;
+
+TUPLE: utf16le ;
+
+TUPLE: utf16 ;
+
+TUPLE: utf16n ;
+
+<PRIVATE
+
+! UTF-16BE decoding
+
+: append-nums ( byte ch -- ch )
+ over [ 8 shift bitor ] [ 2drop replacement-char ] if ;
+
+: double-be ( stream byte -- stream char )
+ over stream-read1 swap append-nums ;
+
+: quad-be ( stream byte -- stream char )
+ double-be over stream-read1 [
+ dup -2 shift BIN: 110111 number= [
+ >r 2 shift r> BIN: 11 bitand bitor
+ over stream-read1 swap append-nums HEX: 10000 +
+ ] [ 2drop dup stream-read1 drop replacement-char ] if
+ ] when* ;
+
+: ignore ( stream -- stream char )
+ dup stream-read1 drop replacement-char ;
+
+: begin-utf16be ( stream byte -- stream char )
+ dup -3 shift BIN: 11011 number= [
+ dup BIN: 00000100 bitand zero?
+ [ BIN: 11 bitand quad-be ]
+ [ drop ignore ] if
+ ] [ double-be ] if ;
+
+M: utf16be decode-char
+ drop dup stream-read1 dup [ begin-utf16be ] when nip ;
+
+! UTF-16LE decoding
+
+: quad-le ( stream ch -- stream char )
+ over stream-read1 swap 10 shift bitor
+ over stream-read1 dup -2 shift BIN: 110111 = [
+ BIN: 11 bitand append-nums HEX: 10000 +
+ ] [ 2drop replacement-char ] if ;
+
+: double-le ( stream byte1 byte2 -- stream char )
+ dup -3 shift BIN: 11011 = [
+ dup BIN: 100 bitand 0 number=
+ [ BIN: 11 bitand 8 shift bitor quad-le ]
+ [ 2drop replacement-char ] if
+ ] [ append-nums ] if ;
+
+: begin-utf16le ( stream byte -- stream char )
+ over stream-read1 [ double-le ] [ drop replacement-char ] if* ;
+
+M: utf16le decode-char
+ drop dup stream-read1 dup [ begin-utf16le ] when nip ;
+
+! UTF-16LE/BE encoding
+
+: encode-first ( char -- byte1 byte2 )
+ -10 shift
+ dup -8 shift BIN: 11011000 bitor
+ swap HEX: FF bitand ;
+
+: encode-second ( char -- byte3 byte4 )
+ BIN: 1111111111 bitand
+ dup -8 shift BIN: 11011100 bitor
+ swap BIN: 11111111 bitand ;
+
+: stream-write2 ( stream char1 char2 -- )
+ rot [ stream-write1 ] curry bi@ ;
+
+: char>utf16be ( stream char -- )
+ dup HEX: FFFF > [
+ HEX: 10000 -
+ 2dup encode-first stream-write2
+ encode-second stream-write2
+ ] [ h>b/b swap stream-write2 ] if ;
+
+M: utf16be encode-char ( char stream encoding -- )
+ drop swap char>utf16be ;
+
+: char>utf16le ( char stream -- )
+ dup HEX: FFFF > [
+ HEX: 10000 -
+ 2dup encode-first swap stream-write2
+ encode-second swap stream-write2
+ ] [ h>b/b stream-write2 ] if ;
+
+M: utf16le encode-char ( char stream encoding -- )
+ drop swap char>utf16le ;
+
+! UTF-16
+
+: bom-le B{ HEX: ff HEX: fe } ; inline
+
+: bom-be B{ HEX: fe HEX: ff } ; inline
+
+: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
+
+: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
+
+TUPLE: missing-bom ;
+M: missing-bom summary drop "The BOM for a UTF-16 stream was missing" ;
+
+: bom>le/be ( bom -- le/be )
+ dup bom-le sequence= [ drop utf16le ] [
+ bom-be sequence= [ utf16be ] [ missing-bom ] if
+ ] if ;
+
+M: utf16 <decoder> ( stream utf16 -- decoder )
+ drop 2 over stream-read bom>le/be <decoder> ;
+
+M: utf16 <encoder> ( stream utf16 -- encoder )
+ drop bom-le over stream-write utf16le <encoder> ;
+
+! Native-order UTF-16
+
+: utf16n ( -- descriptor )
+ little-endian? utf16le utf16be ? ; foldable
+
+M: utf16n <decoder> drop utf16n <decoder> ;
+
+M: utf16n <encoder> drop utf16n <encoder> ;
+
+PRIVATE>
pick 0.0 [
swap >r >r 2dup r> (eval-A-times-u) r> +
] reduce nip
- ] F{ } map-as { float-array } declare 2nip ; inline
+ ] F{ } map-as 2nip ; inline
: (eval-At-times-u) ( u i j -- x )
tuck swap eval-A >r swap nth-unsafe r> * ; inline
pick 0.0 [
swap >r >r 2dup r> (eval-At-times-u) r> +
] reduce nip
- ] F{ } map-as { float-array } declare 2nip ; inline
+ ] F{ } map-as 2nip ; inline
: eval-AtA-times-u ( n u -- seq )
dupd eval-A-times-u eval-At-times-u ; inline
! See http://factorcode.org/license.txt for BSD license.\r
USING: arrays kernel kernel.private math sequences\r
sequences.private growable bit-arrays prettyprint.backend\r
-parser ;\r
+parser accessors ;\r
IN: bit-vectors\r
\r
TUPLE: bit-vector underlying fill ;\r
\r
INSTANCE: bit-vector growable\r
\r
-: ?V \ } [ >bit-vector ] parse-literal ; parsing\r
+: ?V{ \ } [ >bit-vector ] parse-literal ; parsing\r
\r
M: bit-vector >pprint-sequence ;\r
\r
! See http://factorcode.org/license.txt for BSD license.\r
USING: arrays kernel kernel.private math sequences\r
sequences.private growable float-arrays prettyprint.backend\r
-parser ;\r
+parser accessors ;\r
IN: float-vectors\r
\r
TUPLE: float-vector underlying fill ;\r
+++ /dev/null
-Daniel Ehrenberg
+++ /dev/null
-UTF16 encoding/decoding
+++ /dev/null
-USING: help.markup help.syntax io.encodings strings ;
-IN: io.encodings.utf16
-
-ARTICLE: "io.encodings.utf16" "UTF-16"
-"The UTF-16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences. There are three encoding descriptor classes for working with UTF-16, depending on endianness or the presence of a BOM:"
-{ $subsection utf16 }
-{ $subsection utf16le }
-{ $subsection utf16be }
-{ $subsection utf16n } ;
-
-ABOUT: "io.encodings.utf16"
-
-HELP: utf16le
-{ $class-description "The encoding descriptor for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
-{ $see-also "encodings-introduction" } ;
-
-HELP: utf16be
-{ $class-description "The encoding descriptor for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
-{ $see-also "encodings-introduction" } ;
-
-HELP: utf16
-{ $class-description "The encoding descriptor for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." }
-{ $see-also "encodings-introduction" } ;
-
-HELP: utf16n
-{ $class-description "The encoding descriptor for UTF-16 without a byte order mark in native endian order. This is useful mostly for FFI calls which take input of strings of the type wchar_t*" }
-{ $see-also "encodings-introduction" } ;
-
-{ utf16 utf16le utf16be utf16n } related-words
+++ /dev/null
-USING: kernel tools.test io.encodings.utf16 arrays sbufs
-io.streams.byte-array sequences io.encodings io unicode
-io.encodings.string alien.c-types accessors classes ;
-IN: io.encodings.utf16.tests
-
-[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode >array ] unit-test
-[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode >array ] unit-test
-[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode >array ] unit-test
-[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode >array ] unit-test
-
-[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode >array ] unit-test
-
-[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode >array ] unit-test
-[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
-[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
-[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
-[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
-
-[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test
-
-[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode >array ] unit-test
-[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode >array ] unit-test
-
-[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode >array ] unit-test
-
-: correct-endian
- code>> class little-endian? [ utf16le = ] [ utf16be = ] if ;
-
-[ t ] [ B{ } utf16n <byte-reader> correct-endian ] unit-test
-[ t ] [ utf16n <byte-writer> correct-endian ] unit-test
+++ /dev/null
-! Copyright (C) 2006, 2008 Daniel Ehrenberg.
-! See http://factorcode.org/license.txt for BSD license.
-USING: math kernel sequences sbufs vectors namespaces io.binary
-io.encodings combinators splitting io byte-arrays inspector
-alien.c-types ;
-IN: io.encodings.utf16
-
-TUPLE: utf16be ;
-
-TUPLE: utf16le ;
-
-TUPLE: utf16 ;
-
-TUPLE: utf16n ;
-
-<PRIVATE
-
-! UTF-16BE decoding
-
-: append-nums ( byte ch -- ch )
- over [ 8 shift bitor ] [ 2drop replacement-char ] if ;
-
-: double-be ( stream byte -- stream char )
- over stream-read1 swap append-nums ;
-
-: quad-be ( stream byte -- stream char )
- double-be over stream-read1 [
- dup -2 shift BIN: 110111 number= [
- >r 2 shift r> BIN: 11 bitand bitor
- over stream-read1 swap append-nums HEX: 10000 +
- ] [ 2drop dup stream-read1 drop replacement-char ] if
- ] when* ;
-
-: ignore ( stream -- stream char )
- dup stream-read1 drop replacement-char ;
-
-: begin-utf16be ( stream byte -- stream char )
- dup -3 shift BIN: 11011 number= [
- dup BIN: 00000100 bitand zero?
- [ BIN: 11 bitand quad-be ]
- [ drop ignore ] if
- ] [ double-be ] if ;
-
-M: utf16be decode-char
- drop dup stream-read1 dup [ begin-utf16be ] when nip ;
-
-! UTF-16LE decoding
-
-: quad-le ( stream ch -- stream char )
- over stream-read1 swap 10 shift bitor
- over stream-read1 dup -2 shift BIN: 110111 = [
- BIN: 11 bitand append-nums HEX: 10000 +
- ] [ 2drop replacement-char ] if ;
-
-: double-le ( stream byte1 byte2 -- stream char )
- dup -3 shift BIN: 11011 = [
- dup BIN: 100 bitand 0 number=
- [ BIN: 11 bitand 8 shift bitor quad-le ]
- [ 2drop replacement-char ] if
- ] [ append-nums ] if ;
-
-: begin-utf16le ( stream byte -- stream char )
- over stream-read1 [ double-le ] [ drop replacement-char ] if* ;
-
-M: utf16le decode-char
- drop dup stream-read1 dup [ begin-utf16le ] when nip ;
-
-! UTF-16LE/BE encoding
-
-: encode-first ( char -- byte1 byte2 )
- -10 shift
- dup -8 shift BIN: 11011000 bitor
- swap HEX: FF bitand ;
-
-: encode-second ( char -- byte3 byte4 )
- BIN: 1111111111 bitand
- dup -8 shift BIN: 11011100 bitor
- swap BIN: 11111111 bitand ;
-
-: stream-write2 ( stream char1 char2 -- )
- rot [ stream-write1 ] curry bi@ ;
-
-: char>utf16be ( stream char -- )
- dup HEX: FFFF > [
- HEX: 10000 -
- 2dup encode-first stream-write2
- encode-second stream-write2
- ] [ h>b/b swap stream-write2 ] if ;
-
-M: utf16be encode-char ( char stream encoding -- )
- drop swap char>utf16be ;
-
-: char>utf16le ( char stream -- )
- dup HEX: FFFF > [
- HEX: 10000 -
- 2dup encode-first swap stream-write2
- encode-second swap stream-write2
- ] [ h>b/b stream-write2 ] if ;
-
-M: utf16le encode-char ( char stream encoding -- )
- drop swap char>utf16le ;
-
-! UTF-16
-
-: bom-le B{ HEX: ff HEX: fe } ; inline
-
-: bom-be B{ HEX: fe HEX: ff } ; inline
-
-: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
-
-: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
-
-TUPLE: missing-bom ;
-M: missing-bom summary drop "The BOM for a UTF-16 stream was missing" ;
-
-: bom>le/be ( bom -- le/be )
- dup bom-le sequence= [ drop utf16le ] [
- bom-be sequence= [ utf16be ] [ missing-bom ] if
- ] if ;
-
-M: utf16 <decoder> ( stream utf16 -- decoder )
- drop 2 over stream-read bom>le/be <decoder> ;
-
-M: utf16 <encoder> ( stream utf16 -- encoder )
- drop bom-le over stream-write utf16le <encoder> ;
-
-! Native-order UTF-16
-
-: native-utf16 ( -- descriptor )
- little-endian? utf16le utf16be ? ;
-
-M: utf16n <decoder> drop native-utf16 <decoder> ;
-
-M: utf16n <encoder> drop native-utf16 <encoder> ;
-
-PRIVATE>