! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: xml xml.data kernel io io.encodings interval-maps splitting fry
-math.parser sequences combinators assocs locals accessors math
-arrays values io.encodings.ascii ascii io.files biassocs math.order
-combinators.short-circuit io.binary io.encodings.iana ;
+math.parser sequences combinators assocs locals accessors math arrays
+byte-arrays values io.encodings.ascii ascii io.files biassocs
+math.order combinators.short-circuit io.binary io.encodings.iana ;
IN: io.encodings.chinese
SINGLETON: gb18030
! Resource file from:
! http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
+! Algorithms from:
+! http://www-128.ibm.com/developerworks/library/u-china.html
+
+: linear ( bytes -- num )
+ ! This hard-codes bMin and bMax
+ reverse first4
+ 10 * + 126 * + 10 * + ; foldable
+
TUPLE: range ufirst ulast bfirst blast ;
: b>byte-array ( string -- byte-array )
{
[ "uFirst" attr hex> ]
[ "uLast" attr hex> ]
- [ "bFirst" attr b>byte-array ]
- [ "bLast" attr b>byte-array ]
+ [ "bFirst" attr b>byte-array linear ]
+ [ "bLast" attr b>byte-array linear ]
} cleave range boa
] dip push ;
] each-element mapping ranges
] ;
-! Algorithms from:
-! http://www-128.ibm.com/developerworks/library/u-china.html
-
-: linear ( bytes -- num )
- ! This hard-codes bMin and bMax
- reverse first4
- 10 * + 126 * + 10 * + ;
-
: unlinear ( num -- bytes )
B{ HEX: 81 HEX: 30 HEX: 81 HEX: 30 } linear -
- 10 /mod swap [ HEX: 30 + ] dip
- 126 /mod swap [ HEX: 81 + ] dip
- 10 /mod swap [ HEX: 30 + ] dip
+ 10 /mod HEX: 30 + swap
+ 126 /mod HEX: 81 + swap
+ 10 /mod HEX: 30 + swap
HEX: 81 +
- B{ } 4sequence reverse ;
+ 4byte-array dup reverse-here ;
: >interval-map-by ( start-quot end-quot value-quot seq -- interval-map )
'[ _ [ @ 2array ] _ tri ] { } map>assoc <interval-map> ; inline
[ ufirst>> ] [ ulast>> ] [ ] >interval-map-by ;
: ranges-gb>u ( ranges -- interval-map )
- [ bfirst>> linear ] [ blast>> linear ] [ ] >interval-map-by ;
+ [ bfirst>> ] [ blast>> ] [ ] >interval-map-by ;
VALUE: gb>u
VALUE: u>gb
: lookup-range ( char -- byte-array )
dup u>gb interval-at [
- [ ufirst>> - ] [ bfirst>> linear ] bi + unlinear
+ [ ufirst>> - ] [ bfirst>> ] bi + unlinear
] [ encode-error ] if* ;
M: gb18030 encode-char ( char stream encoding -- )
: decode-quad ( byte-array -- char )
dup mapping value-at [ ] [
linear dup gb>u interval-at [
- [ bfirst>> linear - ] [ ufirst>> ] bi +
+ [ bfirst>> - ] [ ufirst>> ] bi +
] [ drop replacement-char ] if*
] ?if ;
: four-byte ( stream byte1 byte2 -- char )
rot 2 swap stream-read dup last-bytes?
- [ first2 B{ } 4sequence decode-quad ]
+ [ first2 4byte-array decode-quad ]
[ 3drop replacement-char ] if ;
: two-byte ( stream byte -- char )
over stream-read1 {
{ [ dup not ] [ 3drop replacement-char ] }
- { [ dup second-byte? ] [ B{ } 2sequence mapping value-at nip ] }
+ { [ dup second-byte? ] [ 2byte-array mapping value-at nip ] }
{ [ dup quad-2/4? ] [ four-byte ] }
[ 3drop replacement-char ]
} cond ;
M: gb18030 decode-char ( stream encoding -- char )
drop dup stream-read1 {
{ [ dup not ] [ 2drop f ] }
- { [ dup ascii? ] [ nip 1array B{ } like mapping value-at ] }
+ { [ dup ascii? ] [ nip 1byte-array mapping value-at ] }
{ [ dup quad-1/3? ] [ two-byte ] }
[ 2drop replacement-char ]
} cond ;
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: sequences kernel io io.files combinators.short-circuit
-math.order values assocs io.encodings io.binary fry strings
-math io.encodings.ascii arrays accessors splitting math.parser
-biassocs io.encodings.iana ;
+math.order values assocs io.encodings io.binary fry strings math
+io.encodings.ascii arrays byte-arrays accessors splitting
+math.parser biassocs io.encodings.iana ;
IN: io.encodings.japanese
SINGLETON: shift-jis
{ [ 0 HEX: 7F between? ] [ HEX: A1 HEX: DF between? ] } 1|| ;
: write-halfword ( stream halfword -- )
- h>b/b swap B{ } 2sequence swap stream-write ;
+ h>b/b swap 2byte-array swap stream-write ;
M: jis encode-char
swapd ch>jis