From: Daniel Ehrenberg Date: Sat, 21 Mar 2009 06:11:45 +0000 (-0500) Subject: Moving unicode.syntax to unicode.categories.syntax; documenting and modifying syntax X-Git-Tag: 0.94~1877^2~57^2~2 X-Git-Url: https://gitweb.factorcode.org/gitweb.cgi?p=factor.git;a=commitdiff_plain;h=62638fb4d30fc1b6126ab84737cfec7305546b56 Moving unicode.syntax to unicode.categories.syntax; documenting and modifying syntax --- diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor index f397ebb2de..22d6cddfb9 100644 --- a/basis/unicode/breaks/breaks.factor +++ b/basis/unicode/breaks/breaks.factor @@ -4,7 +4,7 @@ USING: combinators.short-circuit unicode.categories kernel math combinators splitting sequences math.parser io.files io assocs arrays namespaces make math.ranges unicode.normalize unicode.normalize.private values io.encodings.ascii -unicode.syntax unicode.data compiler.units fry +unicode.data compiler.units fry unicode.categories.syntax alien.syntax sets accessors interval-maps memoize locals words simple-flat-file ; IN: unicode.breaks @@ -32,9 +32,9 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ; [ drop Control ] } case ; -CATEGORY: (extend) Me Mn ; -: extend? ( ch -- ? ) - { [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ; +CATEGORY: extend + Me Mn | + "Other_Grapheme_Extend" property? ; : loe? ( ch -- ? ) "Logical_Order_Exception" property? ; diff --git a/basis/unicode/case/case.factor b/basis/unicode/case/case.factor index fa842b8b81..1ad3931746 100644 --- a/basis/unicode/case/case.factor +++ b/basis/unicode/case/case.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2008, 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. 
USING: unicode.data sequences namespaces -sbufs make unicode.syntax unicode.normalize math hints -unicode.categories combinators unicode.syntax assocs combinators.short-circuit +sbufs make unicode.normalize math hints +unicode.categories combinators assocs combinators.short-circuit strings splitting kernel accessors unicode.breaks fry locals ; QUALIFIED: ascii IN: unicode.case diff --git a/basis/unicode/categories/categories-docs.factor b/basis/unicode/categories/categories-docs.factor index b0870e28fb..924b197417 100644 --- a/basis/unicode/categories/categories-docs.factor +++ b/basis/unicode/categories/categories-docs.factor @@ -12,6 +12,9 @@ HELP: Letter HELP: alpha { $class-description "The class of alphanumeric characters." } ; +HELP: math +{ $class-description "The class of Unicode math characters." } ; + HELP: blank { $class-description "The class of whitespace characters." } ; @@ -54,6 +57,8 @@ ARTICLE: "unicode.categories" "Character classes" { $subsection uncased } { $subsection uncased? } { $subsection character } -{ $subsection character? } ; +{ $subsection character? } +{ $subsection math } +{ $subsection math? } ; ABOUT: "unicode.categories" diff --git a/basis/unicode/categories/categories.factor b/basis/unicode/categories/categories.factor index 0464e31b12..126c03c869 100644 --- a/basis/unicode/categories/categories.factor +++ b/basis/unicode/categories/categories.factor @@ -1,15 +1,16 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: unicode.syntax ; +USING: unicode.categories.syntax sequences unicode.data ; IN: unicode.categories -CATEGORY: blank Zs Zl Zp \r\n ; -CATEGORY: letter Ll ; -CATEGORY: LETTER Lu ; -CATEGORY: Letter Lu Ll Lt Lm Lo ; +CATEGORY: blank Zs Zl Zp | "\r\n" member? ; +CATEGORY: letter Ll | "Other_Lowercase" property? ; +CATEGORY: LETTER Lu | "Other_Uppercase" property? 
; +CATEGORY: Letter Lu Ll Lt Lm Lo Nl ; CATEGORY: digit Nd Nl No ; CATEGORY-NOT: printable Cc Cf Cs Co Cn ; -CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ; +CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ; CATEGORY: control Cc ; CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ; CATEGORY-NOT: character Cn ; +CATEGORY: math Sm | "Other_Math" property? ; diff --git a/basis/unicode/categories/syntax/authors.txt b/basis/unicode/categories/syntax/authors.txt new file mode 100755 index 0000000000..f990dd0ed2 --- /dev/null +++ b/basis/unicode/categories/syntax/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/basis/unicode/categories/syntax/summary.txt b/basis/unicode/categories/syntax/summary.txt new file mode 100644 index 0000000000..651d51c34c --- /dev/null +++ b/basis/unicode/categories/syntax/summary.txt @@ -0,0 +1 @@ +Parsing words used by Unicode implementation diff --git a/basis/unicode/categories/syntax/syntax-docs.factor b/basis/unicode/categories/syntax/syntax-docs.factor new file mode 100644 index 0000000000..6293b92c72 --- /dev/null +++ b/basis/unicode/categories/syntax/syntax-docs.factor @@ -0,0 +1,19 @@ +! Copyright (C) 2008 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: help.syntax help.markup ; +IN: unicode.categories.syntax + +ABOUT: "unicode.categories.syntax" + +ARTICLE: "unicode.categories.syntax" "Unicode category syntax" +"There is special syntax sugar for making predicate classes which are unions of Unicode general categories, plus some other code." +{ $subsection POSTPONE: CATEGORY: } +{ $subsection POSTPONE: CATEGORY-NOT: } ; + +HELP: CATEGORY: +{ $syntax "CATEGORY: foo Nl Pd Lu | \"Diacritic\" property? ;" } +{ $description "This defines a predicate class which is a subset of code points. In this example, " { $snippet "foo" } " is the class of characters which are in the general category Nl or Pd or Lu, or which have the Diacritic property." 
} ; + +HELP: CATEGORY-NOT: +{ $syntax "CATEGORY-NOT: foo Nl Pd Lu | \"Diacritic\" property? ;" } +{ $description "This defines a predicate class which is a subset of code points, the complement of what " { $link POSTPONE: CATEGORY: } " would define. In this example, " { $snippet "foo" } " is the class of characters which are neither in the general category Nl or Pd or Lu, nor have the Diacritic property." } ; diff --git a/basis/unicode/categories/syntax/syntax-tests.factor b/basis/unicode/categories/syntax/syntax-tests.factor new file mode 100644 index 0000000000..1ec622fc98 --- /dev/null +++ b/basis/unicode/categories/syntax/syntax-tests.factor @@ -0,0 +1,3 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. + diff --git a/basis/unicode/categories/syntax/syntax.factor b/basis/unicode/categories/syntax/syntax.factor new file mode 100644 index 0000000000..593bb0bbdd --- /dev/null +++ b/basis/unicode/categories/syntax/syntax.factor @@ -0,0 +1,36 @@ +! Copyright (C) 2008, 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: unicode.data kernel math sequences parser +bit-arrays namespaces sequences.private arrays classes.parser +assocs classes.predicate sets fry splitting accessors ; +IN: unicode.categories.syntax + +! For use in CATEGORY: +SYMBOLS: Cn Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co | ; + +<PRIVATE + +: >category-array ( categories -- bitarray ) + categories [ swap member? 
] with map >bit-array ; + +: [category] ( categories code -- quot ) + [ >category-array ] dip + '[ dup category# _ nth-unsafe [ drop t ] _ if ] ; + +: define-category ( word categories code -- ) + [category] integer swap define-predicate-class ; + +: parse-category ( -- word tokens quot ) + CREATE-CLASS \ ; parse-until { | } split1 + [ [ name>> ] map ] + [ [ [ ] like ] [ [ drop f ] ] if* ] bi* ; + +PRIVATE> + +: CATEGORY: + parse-category define-category ; parsing + +: CATEGORY-NOT: + parse-category + [ categories swap diff ] dip + define-category ; parsing diff --git a/basis/unicode/categories/syntax/tags.txt b/basis/unicode/categories/syntax/tags.txt new file mode 100755 index 0000000000..8e27be7d61 --- /dev/null +++ b/basis/unicode/categories/syntax/tags.txt @@ -0,0 +1 @@ +text diff --git a/basis/unicode/collation/collation.factor b/basis/unicode/collation/collation.factor index 0c51ea4352..b6eddccae0 100755 --- a/basis/unicode/collation/collation.factor +++ b/basis/unicode/collation/collation.factor @@ -4,7 +4,7 @@ USING: combinators.short-circuit sequences io.files io.encodings.ascii kernel values splitting accessors math.parser ascii io assocs strings math namespaces make sorting combinators math.order arrays unicode.normalize unicode.data locals -unicode.syntax macros sequences.deep words unicode.breaks +macros sequences.deep words unicode.breaks quotations combinators.short-circuit simple-flat-file ; IN: unicode.collation diff --git a/basis/unicode/normalize/normalize.factor b/basis/unicode/normalize/normalize.factor index 602d9555ea..aca96a5694 100644 --- a/basis/unicode/normalize/normalize.factor +++ b/basis/unicode/normalize/normalize.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. 
USING: ascii sequences namespaces make unicode.data kernel math arrays locals sorting.insertion accessors assocs math.order combinators -unicode.syntax strings sbufs hints combinators.short-circuit vectors ; +strings sbufs hints combinators.short-circuit vectors ; IN: unicode.normalize category-array ( categories -- bitarray ) - categories [ swap member? ] with map >bit-array ; - -: as-string ( strings -- bit-array ) - concat unescape-string ; - -: [category] ( categories -- quot ) - [ - [ [ categories member? not ] filter as-string ] keep - [ categories member? ] filter >category-array - [ dup category# ] % , [ nth-unsafe [ drop t ] ] % - \ member? 2array >quotation , - \ if , - ] [ ] make ; - -: define-category ( word categories -- ) - [category] integer swap define-predicate-class ; - -PRIVATE> - -: CATEGORY: - CREATE ";" parse-tokens define-category ; parsing - -: CATEGORY-NOT: - CREATE ";" parse-tokens - categories swap diff define-category ; parsing diff --git a/basis/unicode/syntax/tags.txt b/basis/unicode/syntax/tags.txt deleted file mode 100755 index 8e27be7d61..0000000000 --- a/basis/unicode/syntax/tags.txt +++ /dev/null @@ -1 +0,0 @@ -text diff --git a/basis/unicode/unicode-docs.factor b/basis/unicode/unicode-docs.factor index 4ae326ac84..9450b49f0b 100644 --- a/basis/unicode/unicode-docs.factor +++ b/basis/unicode/unicode-docs.factor @@ -15,7 +15,7 @@ $nl { $vocab-subsection "Word and grapheme breaks" "unicode.breaks" } { $vocab-subsection "Unicode normalization" "unicode.normalize" } "The following are mostly for internal use:" -{ $vocab-subsection "Unicode syntax" "unicode.syntax" } +{ $vocab-subsection "Unicode category syntax" "unicode.categories.syntax" } { $vocab-subsection "Unicode data tables" "unicode.data" } { $see-also "ascii" "io.encodings" } ; diff --git a/basis/xml/char-classes/char-classes.factor b/basis/xml/char-classes/char-classes.factor index d510c8a881..153fca0bb7 100644 --- a/basis/xml/char-classes/char-classes.factor +++ 
b/basis/xml/char-classes/char-classes.factor @@ -1,19 +1,27 @@ ! Copyright (C) 2005, 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: kernel sequences unicode.syntax math math.order combinators -hints ; +USING: kernel sequences unicode.categories.syntax math math.order +combinators combinators.short-circuit hints ; IN: xml.char-classes -CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ; -: 1.0name-start? ( char -- ? ) - dup 1.0name-start*? [ drop t ] - [ HEX: 2BB HEX: 2C1 between? ] if ; +! Each CATEGORY: lists Unicode general categories, then after | code testing additional code points. +CATEGORY: 1.0name-start + Ll Lu Lo Lt Nl | { + [ HEX: 2BB HEX: 2C1 between? ] + [ "\u000559\u0006E5\u0006E6_:" member? ] + } 1|| ; -CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ; +CATEGORY: 1.0name-char + Ll Lu Lo Lt Nl Mc Me Mn Lm Nd | + "_-.\u000387:" member? ; -CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ; +CATEGORY: 1.1name-start + Ll Lu Lo Lm Nl | + "_:" member? ; -CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ; +CATEGORY: 1.1name-char + Ll Lu Lo Lm Nl Mc Mn Nd Pc Cf | + "_-.\u0000b7:" member? ; : name-start? ( 1.0? char -- ? ) swap [ 1.0name-start? ] [ 1.1name-start? ] if ;