! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.data sequences sequences.next namespaces make unicode.syntax
-unicode.normalize math unicode.categories combinators unicode.syntax
-assocs strings splitting kernel accessors unicode.breaks fry ;
+USING: unicode.data sequences sequences.next namespaces
+sbufs make unicode.syntax unicode.normalize math hints
+unicode.categories combinators unicode.syntax assocs
+strings splitting kernel accessors unicode.breaks fry locals ;
+QUALIFIED: ascii
IN: unicode.case
<PRIVATE
-: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ;
+: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ; inline
-: ch>lower ( ch -- lower ) simple-lower at-default ;
-: ch>upper ( ch -- upper ) simple-upper at-default ;
-: ch>title ( ch -- title ) simple-title at-default ;
+: ch>lower ( ch -- lower ) simple-lower at-default ; inline
+: ch>upper ( ch -- upper ) simple-upper at-default ; inline
+: ch>title ( ch -- title ) simple-title at-default ; inline
PRIVATE>
SYMBOL: locale ! Just casing locale, or overall?
[ dup ] swap '[ _ split1-slice swap ] [ ] produce nip ;
: replace ( old new str -- newstr )
- [ split-subseq ] dip join ;
+ [ split-subseq ] dip join ; inline
: i-dot? ( -- ? )
locale get { "tr" "az" } member? ;
[ [ "" ] [
dup first mark-above?
[ CHAR: combining-dot-above prefix ] when
- ] if-empty ] with-rest ;
+ ] if-empty ] with-rest ; inline
: lithuanian>lower ( string -- lower )
"i" split add-dots "i" join
- "j" split add-dots "i" join ;
+ "j" split add-dots "i" join ; inline
: turk>upper ( string -- upper-i )
- "i" "I\u000307" replace ;
+ "i" "I\u000307" replace ; inline
: turk>lower ( string -- lower-i )
"I\u000307" "i" replace
- "I" "\u000131" replace ;
+ "I" "\u000131" replace ; inline
: fix-sigma-end ( string -- string )
[ "" ] [
dup peek CHAR: greek-small-letter-sigma =
[ 1 head* CHAR: greek-small-letter-final-sigma suffix ] when
- ] if-empty ;
+ ] if-empty ; inline
: sigma-map ( string -- string )
{ CHAR: greek-capital-letter-sigma } split [ [
CHAR: greek-small-letter-final-sigma
CHAR: greek-small-letter-sigma ? prefix
] if-empty
- ] map ] with-rest concat fix-sigma-end ;
+ ] map ] with-rest concat fix-sigma-end ; inline
: final-sigma ( string -- string )
CHAR: greek-capital-letter-sigma
- over member? [ sigma-map ] when ;
+ over member? [ sigma-map ] when
+ "" like ; inline
-: map-case ( string string-quot char-quot -- case )
- [
- [
- [ dup special-casing at ] 2dip
- [ [ % ] compose ] [ [ , ] compose ] bi* ?if
- ] 2curry each
- ] "" make ; inline
+:: map-case ( string string-quot char-quot -- case )
+ string length <sbuf> :> out
+ string [
+ dup special-casing at
+ [ string-quot call out push-all ]
+ [ char-quot call out push ] ?if
+ ] each out "" like ; inline
PRIVATE>
i-dot? [ turk>lower ] when final-sigma
[ lower>> ] [ ch>lower ] map-case ;
+HINTS: >lower string ;
+
: >upper ( string -- upper )
i-dot? [ turk>upper ] when
[ upper>> ] [ ch>upper ] map-case ;
+HINTS: >upper string ;
+
<PRIVATE
: (>title) ( string -- title )
i-dot? [ turk>upper ] when
- [ title>> ] [ ch>title ] map-case ;
+ [ title>> ] [ ch>title ] map-case ; inline
: title-word ( string -- title )
- unclip 1string [ >lower ] [ (>title) ] bi* prepend ;
+ unclip 1string [ >lower ] [ (>title) ] bi* prepend ; inline
PRIVATE>
: >title ( string -- title )
final-sigma >words [ title-word ] map concat ;
+HINTS: >title string ;
+
: >case-fold ( string -- fold )
>upper >lower ;
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: sequences namespaces make unicode.data kernel math arrays
+USING: ascii sequences namespaces make unicode.data kernel math arrays
locals sorting.insertion accessors assocs math.order combinators
-unicode.syntax strings sbufs ;
+unicode.syntax strings sbufs hints combinators.short-circuit vectors ;
IN: unicode.normalize
<PRIVATE
CONSTANT: final-count 28
: ?between? ( n/f from to -- ? )
- pick [ between? ] [ 3drop f ] if ;
+ pick [ between? ] [ 3drop f ] if ; inline
-: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
-: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
+: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ; inline
+: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ; inline
! These numbers come from UAX 29
: initial? ( ch -- ? )
- dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ;
-: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
-: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;
+ dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ; inline
+: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ; inline
+: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ; inline
: hangul>jamo ( hangul -- jamo-string )
hangul-base - final-count /mod final-base +
: reorder-slice ( string start -- slice done? )
2dup swap [ non-starter? not ] find-from drop
- [ [ over length ] unless* rot <slice> ] keep not ;
+ [ [ over length ] unless* rot <slice> ] keep not ; inline
: reorder-next ( string i -- new-i done? )
over [ non-starter? ] find-from drop [
reorder-slice
[ dup [ combining-class ] insertion-sort to>> ] dip
- ] [ length t ] if* ;
+ ] [ length t ] if* ; inline
: reorder-loop ( string start -- )
- dupd reorder-next [ 2drop ] [ reorder-loop ] if ;
+ dupd reorder-next [ 2drop ] [ reorder-loop ] if ; inline recursive
: reorder ( string -- )
0 reorder-loop ;
over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ;
:: decompose ( string quot -- decomposed )
- [let | out [ string length <sbuf> ] |
- string [
+ string length <sbuf> :> out
+ string [
+ >fixnum dup ascii? [ out push ] [
dup hangul? [ hangul>jamo out push-all ]
[ dup quot call [ out push-all ] [ out push ] ?if ] if
- ] each out >string
- ] dup reorder ;
+ ] if
+ ] each
+ out "" like dup reorder ; inline
: with-string ( str quot -- str )
over aux>> [ call ] [ drop ] if ; inline
: (nfd) ( string -- nfd )
[ canonical-entry ] decompose ;
+HINTS: (nfd) string ;
+
: (nfkd) ( string -- nfkd )
[ compatibility-entry ] decompose ;
+HINTS: (nfkd) string ;
+
PRIVATE>
: nfd ( string -- nfd )
0 over ?nth non-starter?
[ length dupd reorder-back ] [ drop ] if ;
+HINTS: string-append string string ;
+
<PRIVATE
! Normalization -- Composition
-SYMBOL: main-str
-SYMBOL: ind
-SYMBOL: after
-SYMBOL: char
-
-: get-str ( i -- ch ) ind get + main-str get ?nth ;
-: current ( -- ch ) 0 get-str ;
-: to ( -- ) ind inc ;
-
-: initial-medial? ( -- ? )
- current initial? [ 1 get-str medial? ] [ f ] if ;
-
-: --final? ( -- ? )
- 2 get-str final? ;
-
-: imf, ( -- )
- current to current to current jamo>hangul , ;
-: im, ( -- )
- current to current final-base jamo>hangul , ;
-
-: compose-jamo ( -- )
- initial-medial? [
- --final? [ imf, ] [ im, ] if
- ] [ current , ] if to ;
-
-: pass-combining ( -- )
- current non-starter? [ current , to pass-combining ] when ;
-
-:: try-compose ( last-class new-char current-class -- new-class )
- last-class current-class = [ new-char after get push last-class ] [
- char get new-char combine-chars
- [ char set last-class ]
- [ new-char after get push current-class ] if*
+: initial-medial? ( str i -- ? )
+ { [ swap nth initial? ] [ 1+ swap ?nth medial? ] } 2&& ;
+
+: --final? ( str i -- ? )
+ 2 + swap ?nth final? ;
+
+: imf, ( str i -- str i )
+ [ tail-slice first3 jamo>hangul , ]
+ [ 3 + ] 2bi ;
+
+: im, ( str i -- str i )
+ [ tail-slice first2 final-base jamo>hangul , ]
+ [ 2 + ] 2bi ;
+
+: compose-jamo ( str i -- str i )
+ 2dup initial-medial? [
+ 2dup --final? [ imf, ] [ im, ] if
+ ] [ 2dup swap nth , 1+ ] if ;
+
+: pass-combining ( str -- str i )
+ dup [ non-starter? not ] find drop
+ [ dup length ] unless*
+ 2dup head-slice % ;
+
+TUPLE: compose-state i str char after last-class ;
+
+: get-str ( state i -- ch )
+ swap [ i>> + ] [ str>> ] bi ?nth ;
+: current ( state -- ch ) 0 get-str ;
+: to ( state -- state ) [ 1+ ] change-i ;
+: push-after ( ch state -- state ) [ ?push ] change-after ;
+
+:: try-compose ( state new-char current-class -- state )
+ state last-class>> current-class =
+ [ new-char state push-after ] [
+ state char>> new-char combine-chars
+ [ state swap >>char ] [
+ new-char state push-after
+ current-class >>last-class
+ ] if*
] if ;
DEFER: compose-iter
-: try-noncombining ( char -- )
- char get swap combine-chars
- [ char set to f compose-iter ] when* ;
+: try-noncombining ( char state -- state )
+ tuck char>> swap combine-chars
+ [ >>char to f >>last-class compose-iter ] when* ;
-: compose-iter ( last-class -- )
- current [
+: compose-iter ( state -- state )
+ dup current [
dup combining-class {
- { f [ 2drop ] }
- { 0 [ swap [ drop ] [ try-noncombining ] if ] }
+ { f [ drop ] }
+ { 0 [
+ over last-class>>
+ [ drop ] [ swap try-noncombining ] if ] }
[ try-compose to compose-iter ]
} case
- ] [ drop ] if* ;
-
-: ?new-after ( -- )
- after [ dup empty? [ drop SBUF" " clone ] unless ] change ;
-
-: compose-combining ( ch -- )
- char set to ?new-after
- f compose-iter
- char get , after get % ;
+ ] when* ;
-: (compose) ( -- )
- current [
- dup jamo? [ drop compose-jamo ] [
- 1 get-str combining-class
- [ compose-combining ] [ , to ] if
+: compose-combining ( ch str i -- str i )
+ compose-state new
+ swap >>i
+ swap >>str
+ swap >>char
+ compose-iter
+ { [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ;
+
+:: (compose) ( str i -- )
+ i str ?nth [
+ dup jamo? [ drop str i compose-jamo ] [
+ i 1+ str ?nth combining-class
+ [ str i 1+ compose-combining ] [ , str i 1+ ] if
] if (compose)
] when* ;
: combine ( str -- comp )
- [
- main-str set
- 0 ind set
- SBUF" " clone after set
- pass-combining (compose)
- ] "" make ;
+ [ pass-combining (compose) ] "" make ;
PRIVATE>