combinators splitting sequences math.parser io.files io assocs
arrays namespaces make math.ranges unicode.normalize
unicode.normalize.private values io.encodings.ascii
-unicode.syntax unicode.data compiler.units fry
+unicode.data compiler.units fry unicode.categories.syntax
alien.syntax sets accessors interval-maps memoize locals words
simple-flat-file ;
IN: unicode.breaks
[ drop Control ]
} case ;
-CATEGORY: (extend) Me Mn ;
-: extend? ( ch -- ? )
- { [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ;
+! Predicate class of code points whose general category is Me or Mn,
+! or for which "Other_Grapheme_Extend" property? answers true.
+CATEGORY: extend
+ Me Mn |
+ "Other_Grapheme_Extend" property? ;
: loe? ( ch -- ? )
"Logical_Order_Exception" property? ;
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: unicode.data sequences namespaces
-sbufs make unicode.syntax unicode.normalize math hints
-unicode.categories combinators unicode.syntax assocs combinators.short-circuit
+sbufs make unicode.normalize math hints
+unicode.categories combinators assocs combinators.short-circuit
strings splitting kernel accessors unicode.breaks fry locals ;
QUALIFIED: ascii
IN: unicode.case
HELP: alpha
{ $class-description "The class of alphanumeric characters." } ;
+HELP: math
+{ $class-description "The class of Unicode math characters." } ;
+
HELP: blank
{ $class-description "The class of whitespace characters." } ;
{ $subsection uncased }
{ $subsection uncased? }
{ $subsection character }
-{ $subsection character? } ;
+{ $subsection character? }
+{ $subsection math }
+{ $subsection math? } ;
ABOUT: "unicode.categories"
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.syntax ;
+USING: unicode.categories.syntax sequences unicode.data ;
IN: unicode.categories
-CATEGORY: blank Zs Zl Zp \r\n ;
-CATEGORY: letter Ll ;
-CATEGORY: LETTER Lu ;
-CATEGORY: Letter Lu Ll Lt Lm Lo ;
+! Each class below matches the general categories listed before |, plus any
+! code point for which the code after | answers true.
+CATEGORY: blank Zs Zl Zp | "\r\n" member? ;
+CATEGORY: letter Ll | "Other_Lowercase" property? ;
+CATEGORY: LETTER Lu | "Other_Uppercase" property? ;
+CATEGORY: Letter Lu Ll Lt Lm Lo Nl ;
CATEGORY: digit Nd Nl No ;
CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
-CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
+! Letter and number general categories, plus code points that have the
+! Other_Alphabetic property.
+CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ;
CATEGORY: control Cc ;
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
CATEGORY-NOT: character Cn ;
+! General category Sm, plus code points that have the Other_Math property.
+CATEGORY: math Sm | "Other_Math" property? ;
--- /dev/null
+Daniel Ehrenberg
--- /dev/null
+Parsing words used by Unicode implementation
--- /dev/null
+! Copyright (C) 2008 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.syntax help.markup ;
+IN: unicode.categories.syntax
+
+ABOUT: "unicode.categories.syntax"
+
+ARTICLE: "unicode.categories.syntax" "Unicode category syntax"
+"There is special syntax sugar for defining predicate classes which are unions of Unicode general categories, optionally extended with arbitrary predicate code written after " { $snippet "|" } "."
+{ $subsection POSTPONE: CATEGORY: }
+{ $subsection POSTPONE: CATEGORY-NOT: } ;
+
+HELP: CATEGORY:
+{ $syntax "CATEGORY: foo Nl Pd Lu | \"Diacritic\" property? ;" }
+{ $description "This defines a predicate class which is a subset of code points. In this example, " { $snippet "foo" } " is the class of characters which are in the general category Nl or Pd or Lu, or which have the Diacritic property." } ;
+
+HELP: CATEGORY-NOT:
+{ $syntax "CATEGORY-NOT: foo Nl Pd Lu | \"Diacritic\" property? ;" }
+{ $description "This defines a predicate class which is a subset of code points, the complement of what " { $link POSTPONE: CATEGORY: } " would define. In this example, " { $snippet "foo" } " is the class of characters which are neither in the general category Nl or Pd or Lu, nor have the Diacritic property." } ;
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+
--- /dev/null
+! Copyright (C) 2008, 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: unicode.data kernel math sequences parser
+bit-arrays namespaces sequences.private arrays classes.parser
+assocs classes.predicate sets fry splitting accessors ;
+IN: unicode.categories.syntax
+
+! For use in CATEGORY:
+SYMBOLS: Cn Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co | ;
+
+<PRIVATE
+
+! Maps over the sequence pushed by the categories word and produces a bit
+! array of the same length: an entry is set when the corresponding element
+! is a member of the input sequence.
+: >category-array ( categories -- bitarray )
+ categories [ swap member? ] with map >bit-array ;
+
+! Builds the predicate quotation for a category class. The quotation looks up
+! the input's category# in the bit array made from categories; on a hit it
+! drops the input and answers t, otherwise it runs code on the input.
+! NOTE(review): nth-unsafe skips the bounds check, so this assumes category#
+! always yields a valid index into that array -- confirm against unicode.data.
+: [category] ( categories code -- quot )
+ [ >category-array ] dip
+ '[ dup category# _ nth-unsafe [ drop t ] _ if ] ;
+
+! Defines word as a predicate class with superclass integer, using the
+! quotation built by [category] as its predicate definition.
+: define-category ( word categories code -- )
+ [category] integer swap define-predicate-class ;
+
+! Parses the body of a category definition up to the closing ; .
+! Outputs the newly created class word, the names of the words listed
+! before | (as strings), and the words after | converted to a quotation.
+! When no | is present the quotation is [ drop f ], so only the listed
+! categories are tested.
+: parse-category ( -- word tokens quot )
+ CREATE-CLASS \ ; parse-until { | } split1
+ [ [ name>> ] map ]
+ [ [ [ ] like ] [ [ drop f ] ] if* ] bi* ;
+
+PRIVATE>
+
+! Parsing word. Defines a predicate class matching code points that are in
+! one of the listed general categories or satisfy the optional code after | .
+: CATEGORY:
+ parse-category define-category ; parsing
+
+! Parsing word. Defines the complement of the class CATEGORY: would define
+! for the same body: a code point is a member only if it is in none of the
+! listed general categories AND the optional code after | answers false.
+! The whole CATEGORY: predicate is negated, rather than just inverting the
+! category list, so that the code clause is complemented too. When no | is
+! given the code defaults to [ drop f ], so the class is simply "not in any
+! of the listed categories".
+: CATEGORY-NOT:
+    parse-category [category] [ not ] compose
+    integer swap define-predicate-class ; parsing
io.encodings.ascii kernel values splitting accessors math.parser\r
ascii io assocs strings math namespaces make sorting combinators\r
math.order arrays unicode.normalize unicode.data locals\r
-unicode.syntax macros sequences.deep words unicode.breaks\r
+macros sequences.deep words unicode.breaks\r
quotations combinators.short-circuit simple-flat-file ;\r
IN: unicode.collation\r
\r
! See http://factorcode.org/license.txt for BSD license.
USING: ascii sequences namespaces make unicode.data kernel math arrays
locals sorting.insertion accessors assocs math.order combinators
-unicode.syntax strings sbufs hints combinators.short-circuit vectors ;
+strings sbufs hints combinators.short-circuit vectors ;
IN: unicode.normalize
<PRIVATE
+++ /dev/null
-Daniel Ehrenberg
+++ /dev/null
-Parsing words used by Unicode implementation
+++ /dev/null
-! Copyright (C) 2008 Daniel Ehrenberg.
-! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.data kernel math sequences parser lexer
-bit-arrays namespaces make sequences.private arrays quotations
-assocs classes.predicate math.order strings.parser sets ;
-IN: unicode.syntax
-
-<PRIVATE
-
-: >category-array ( categories -- bitarray )
- categories [ swap member? ] with map >bit-array ;
-
-: as-string ( strings -- bit-array )
- concat unescape-string ;
-
-: [category] ( categories -- quot )
- [
- [ [ categories member? not ] filter as-string ] keep
- [ categories member? ] filter >category-array
- [ dup category# ] % , [ nth-unsafe [ drop t ] ] %
- \ member? 2array >quotation ,
- \ if ,
- ] [ ] make ;
-
-: define-category ( word categories -- )
- [category] integer swap define-predicate-class ;
-
-PRIVATE>
-
-: CATEGORY:
- CREATE ";" parse-tokens define-category ; parsing
-
-: CATEGORY-NOT:
- CREATE ";" parse-tokens
- categories swap diff define-category ; parsing
{ $vocab-subsection "Word and grapheme breaks" "unicode.breaks" }
{ $vocab-subsection "Unicode normalization" "unicode.normalize" }
"The following are mostly for internal use:"
-{ $vocab-subsection "Unicode syntax" "unicode.syntax" }
+{ $vocab-subsection "Unicode category syntax" "unicode.categories.syntax" }
{ $vocab-subsection "Unicode data tables" "unicode.data" }
{ $see-also "ascii" "io.encodings" } ;
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
-USING: kernel sequences unicode.syntax math math.order combinators
-hints ;
+USING: kernel sequences unicode.categories.syntax math math.order
+combinators combinators.short-circuit hints ;
IN: xml.char-classes
-CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
-: 1.0name-start? ( char -- ? )
- dup 1.0name-start*? [ drop t ]
- [ HEX: 2BB HEX: 2C1 between? ] if ;
+! Matches code points in the listed general categories, in the inclusive
+! range HEX: 2BB .. HEX: 2C1, or equal to one of the characters in the
+! string literal.
+CATEGORY: 1.0name-start
+ Ll Lu Lo Lt Nl | {
+ [ HEX: 2BB HEX: 2C1 between? ]
+ [ "\u000559\u0006E5\u0006E6_:" member? ]
+ } 1|| ;
-CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ;
+! Matches code points in the listed general categories, or equal to one of
+! the characters in the string literal.
+CATEGORY: 1.0name-char
+ Ll Lu Lo Lt Nl Mc Me Mn Lm Nd |
+ "_-.\u000387:" member? ;
-CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ;
+! Matches code points in the listed general categories, or equal to _ or : .
+! Every name before | must be a general-category symbol; Ln is not a Unicode
+! general category and has no such symbol, so it must not be listed.
+CATEGORY: 1.1name-start
+    Ll Lu Lo Lm Nl |
+    "_:" member? ;
-CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
+! Matches code points in the listed general categories, or equal to one of
+! the characters in the string literal.
+! Every name before | must be a general-category symbol; Ln is not a Unicode
+! general category and has no such symbol, so it must not be listed.
+CATEGORY: 1.1name-char
+    Ll Lu Lo Lm Nl Mc Mn Nd Pc Cf |
+    "_-.\u0000b7:" member? ;
: name-start? ( 1.0? char -- ? )
swap [ 1.0name-start? ] [ 1.1name-start? ] if ;