Merge branch 'master' into finish_syntax_colon

author Slava Pestov <slava@slava-pestovs-macbook-pro.local>

Sat, 21 Mar 2009 23:13:47 +0000 (18:13 -0500)

committer Slava Pestov <slava@slava-pestovs-macbook-pro.local>

Sat, 21 Mar 2009 23:13:47 +0000 (18:13 -0500)
author Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Sat, 21 Mar 2009 23:13:47 +0000 (18:13 -0500)
committer Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Sat, 21 Mar 2009 23:13:47 +0000 (18:13 -0500)
diff --git a/basis/html/templates/chloe/components/components.factor b/basis/html/templates/chloe/components/components.factor

index 19f2019266f4fc142bf1962901f86b954bca6254..d69dc085371f28d0a7041f6432630e7a6ac82131 100644 (file)
--- a/basis/html/templates/chloe/components/components.factor
+++ b/basis/html/templates/chloe/components/components.factor
@@ -25,8 +25,7 @@ M: tuple-class component-tag ( tag class -- )
      [ compile-component-attrs ] 2bi
      [ render ] [code] ;
  
-: COMPONENT:
+SYNTAX: COMPONENT:
      scan-word
      [ name>> ] [ '[ _ component-tag ] ] bi
      define-chloe-tag ;
-    parsing
diff --git a/basis/io/encodings/iso2022/iso2022-tests.factor b/basis/io/encodings/iso2022/iso2022-tests.factor

index b8a628c8ba16ab414d61e40792a9858f5c56ed01..9111eee9559bca070e513b5faf29893b61878255 100644 (file)
--- a/basis/io/encodings/iso2022/iso2022-tests.factor
+++ b/basis/io/encodings/iso2022/iso2022-tests.factor
@@ -7,30 +7,30 @@ IN: io.encodings.iso2022
  [ "hello" ] [ "hello" >byte-array iso2022 decode ] unit-test
  [ "hello" ] [ "hello" iso2022 encode >string ] unit-test
  
-[ "hi" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: B CHAR: i } iso2022 decode ] unit-test
-[ "hi" ] [ B{ CHAR: h CHAR: i $ ESC CHAR: ( CHAR: B } iso2022 decode ] unit-test
-[ "hi\u00fffd" ] [ B{ CHAR: h CHAR: i $ ESC CHAR: ( } iso2022 decode ] unit-test
-[ "hi\u00fffd" ] [ B{ CHAR: h CHAR: i $ ESC } iso2022 decode ] unit-test
+[ "hi" ] [ B{ CHAR: h ESC CHAR: ( CHAR: B CHAR: i } iso2022 decode ] unit-test
+[ "hi" ] [ B{ CHAR: h CHAR: i ESC CHAR: ( CHAR: B } iso2022 decode ] unit-test
+[ "hi\u00fffd" ] [ B{ CHAR: h CHAR: i ESC CHAR: ( } iso2022 decode ] unit-test
+[ "hi\u00fffd" ] [ B{ CHAR: h CHAR: i ESC } iso2022 decode ] unit-test
  
-[ B{ CHAR: h $ ESC CHAR: ( CHAR: J HEX: D8 } ] [ "h\u00ff98" iso2022 encode ] unit-test
-[ "h\u00ff98" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: J HEX: D8 } iso2022 decode ] unit-test
-[ "hi" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: J CHAR: i } iso2022 decode ] unit-test
-[ "h" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: J } iso2022 decode ] unit-test
-[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: J HEX: 80 } iso2022 decode ] unit-test
+[ B{ CHAR: h ESC CHAR: ( CHAR: J HEX: D8 } ] [ "h\u00ff98" iso2022 encode ] unit-test
+[ "h\u00ff98" ] [ B{ CHAR: h ESC CHAR: ( CHAR: J HEX: D8 } iso2022 decode ] unit-test
+[ "hi" ] [ B{ CHAR: h ESC CHAR: ( CHAR: J CHAR: i } iso2022 decode ] unit-test
+[ "h" ] [ B{ CHAR: h ESC CHAR: ( CHAR: J } iso2022 decode ] unit-test
+[ "h\u00fffd" ] [ B{ CHAR: h ESC CHAR: ( CHAR: J HEX: 80 } iso2022 decode ] unit-test
  
-[ B{ CHAR: h $ ESC CHAR: $ CHAR: B HEX: 3E HEX: 47 } ] [ "h\u007126" iso2022 encode ] unit-test
-[ "h\u007126" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: B HEX: 3E HEX: 47 } iso2022 decode ] unit-test
-[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: B HEX: 3E } iso2022 decode ] unit-test
-[ "h" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: B } iso2022 decode ] unit-test
-[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ } iso2022 decode ] unit-test
-[ "h\u00fffd" ] [ B{ CHAR: h $ ESC } iso2022 decode ] unit-test
-[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: B HEX: 80 HEX: 80 } iso2022 decode ] unit-test
+[ B{ CHAR: h ESC CHAR: $ CHAR: B HEX: 3E HEX: 47 } ] [ "h\u007126" iso2022 encode ] unit-test
+[ "h\u007126" ] [ B{ CHAR: h ESC CHAR: $ CHAR: B HEX: 3E HEX: 47 } iso2022 decode ] unit-test
+[ "h\u00fffd" ] [ B{ CHAR: h ESC CHAR: $ CHAR: B HEX: 3E } iso2022 decode ] unit-test
+[ "h" ] [ B{ CHAR: h ESC CHAR: $ CHAR: B } iso2022 decode ] unit-test
+[ "h\u00fffd" ] [ B{ CHAR: h ESC CHAR: $ } iso2022 decode ] unit-test
+[ "h\u00fffd" ] [ B{ CHAR: h ESC } iso2022 decode ] unit-test
+[ "h\u00fffd" ] [ B{ CHAR: h ESC CHAR: $ CHAR: B HEX: 80 HEX: 80 } iso2022 decode ] unit-test
  
-[ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D HEX: 38 HEX: 54 } ] [ "h\u0058ce" iso2022 encode ] unit-test
-[ "h\u0058ce" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D HEX: 38 HEX: 54 } iso2022 decode ] unit-test
-[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D HEX: 38 } iso2022 decode ] unit-test
-[ "h" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D } iso2022 decode ] unit-test
-[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( } iso2022 decode ] unit-test
-[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D HEX: 70 HEX: 70 } iso2022 decode ] unit-test
+[ B{ CHAR: h ESC CHAR: $ CHAR: ( CHAR: D HEX: 38 HEX: 54 } ] [ "h\u0058ce" iso2022 encode ] unit-test
+[ "h\u0058ce" ] [ B{ CHAR: h ESC CHAR: $ CHAR: ( CHAR: D HEX: 38 HEX: 54 } iso2022 decode ] unit-test
+[ "h\u00fffd" ] [ B{ CHAR: h ESC CHAR: $ CHAR: ( CHAR: D HEX: 38 } iso2022 decode ] unit-test
+[ "h" ] [ B{ CHAR: h ESC CHAR: $ CHAR: ( CHAR: D } iso2022 decode ] unit-test
+[ "h\u00fffd" ] [ B{ CHAR: h ESC CHAR: $ CHAR: ( } iso2022 decode ] unit-test
+[ "h\u00fffd" ] [ B{ CHAR: h ESC CHAR: $ CHAR: ( CHAR: D HEX: 70 HEX: 70 } iso2022 decode ] unit-test
  
  [ "\u{syriac-music}" iso2022 encode ] must-fail
diff --git a/basis/io/encodings/iso2022/iso2022.factor b/basis/io/encodings/iso2022/iso2022.factor

index 3dabb894e41602cb29ee75b157862bbf8e2d1b32..a057df28e0aa2a0b90e60b2fba0b1568b4fbafe4 100644 (file)
--- a/basis/io/encodings/iso2022/iso2022.factor
+++ b/basis/io/encodings/iso2022/iso2022.factor
@@ -2,7 +2,7 @@
  ! See http://factorcode.org/license.txt for BSD license.
  USING: io.encodings kernel sequences io simple-flat-file sets math
  combinators.short-circuit io.binary values arrays assocs
-locals accessors combinators literals biassocs byte-arrays ;
+locals accessors combinators biassocs byte-arrays parser ;
  IN: io.encodings.iso2022
  
  SINGLETON: iso2022
@@ -31,12 +31,12 @@ M: iso2022 <encoder>
  M: iso2022 <decoder>
      make-iso-coder <decoder> ;
  
-CONSTANT: ESC HEX: 16
+<< SYNTAX: ESC HEX: 1B parsed ; >> ! ASCII ESC (0x1B) starts every ISO 2022 escape sequence; 0x16 is SYN
  
-CONSTANT: switch-ascii B{ $ ESC CHAR: ( CHAR: B }
-CONSTANT: switch-jis201 B{ $ ESC CHAR: ( CHAR: J }
-CONSTANT: switch-jis208 B{ $ ESC CHAR: $ CHAR: B }
-CONSTANT: switch-jis212 B{ $ ESC CHAR: $ CHAR: ( CHAR: D }
+CONSTANT: switch-ascii B{ ESC CHAR: ( CHAR: B }
+CONSTANT: switch-jis201 B{ ESC CHAR: ( CHAR: J }
+CONSTANT: switch-jis208 B{ ESC CHAR: $ CHAR: B }
+CONSTANT: switch-jis212 B{ ESC CHAR: $ CHAR: ( CHAR: D }
  
  : find-type ( char -- code type )
      {
diff --git a/basis/regexp/authors.txt b/basis/regexp/authors.txt

index 7c1b2f22790bfdca05f14a555a40b7eaa3ce2abd..a4a77d97e963679ec4dbe6317c19e936c2ce96d9 100644 (file)
--- a/basis/regexp/authors.txt
+++ b/basis/regexp/authors.txt
@@ -1 +1,2 @@
  Doug Coleman
+Daniel Ehrenberg
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor

index e114dea26013f5be5123d060645bd141f6b2ecf4..a1c4e3ca2a53cc3e01725d62f307a8a45b3e823c 100644 (file)
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -230,7 +230,10 @@ TUPLE: class-partition integers not-integers simples not-simples and or other ;
      dup or-class flatten partition-classes
      dup not-integers>> length {
          { 0 [ nip make-or-class ] }
-        { 1 [ not-integers>> first [ class>> '[ _ swap class-member? ] any? ] keep or ] }
+        { 1 [
+            not-integers>> first
+            [ class>> '[ _ swap class-member? ] any? ] keep or
+        ] }
          [ 3drop t ]
      } case ;
  
@@ -251,6 +254,12 @@ M: or-class <not-class>
  M: t <not-class> drop f ;
  M: f <not-class> drop t ;
  
+: <minus-class> ( a b -- a-b )
+    <not-class> 2array <and-class> ;
+
+: <sym-diff-class> ( a b -- a~b )
+    2array [ <or-class> ] [ <and-class> ] bi <minus-class> ;
+
  M: primitive-class class-member?
      class>> class-member? ;
  
diff --git a/basis/regexp/parser/parser-tests.factor b/basis/regexp/parser/parser-tests.factor

index d606015f617e19e5e3a181174e0425df838593c1..5ea9753fbaf66b9ec2a964a7a8db951f30a0cb9d 100644 (file)
--- a/basis/regexp/parser/parser-tests.factor
+++ b/basis/regexp/parser/parser-tests.factor
@@ -11,7 +11,7 @@ IN: regexp.parser.tests
      "a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||"
      "(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|"
      "[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]"
-    "[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
+    "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
      "(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]"
      "[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)"
      "\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}"
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor

index e8de469a9475cb7dcce7026b9f4768378a0e101a..9fcadc40084f78d7e3924d2b5264afcee144cab4 100644 (file)
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -148,19 +148,29 @@ Character = EscapeSequence
            | "^" => [[ ^ <tagged-epsilon> ]]
            | . ?[ allowed-char? ]?
  
-AnyRangeCharacter = EscapeSequence | .
+AnyRangeCharacter = !("&&"|"||"|"--"|"~~") (EscapeSequence | .)
  
  RangeCharacter = !("]") AnyRangeCharacter
  
-Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
+Range = RangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
        | RangeCharacter
  
-StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
+StartRange = AnyRangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
             | AnyRangeCharacter
  
  Ranges = StartRange:s Range*:r => [[ r s prefix ]]
  
-CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
+BasicCharClass =  "^"?:n Ranges:e => [[ e n char-class ]]
+
+CharClass = BasicCharClass:b "&&" CharClass:c
+                => [[ b c 2array <and-class> ]]
+          | BasicCharClass:b "||" CharClass:c
+                => [[ b c 2array <or-class> ]]
+          | BasicCharClass:b "~~" CharClass:c
+                => [[ b c <sym-diff-class> ]]
+          | BasicCharClass:b "--" CharClass:c
+                => [[ b c <minus-class> ]]
+          | BasicCharClass
  
  Options = [idmsux]*
  
diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor

index 6d9f03781d53490572ed9720557c7d923265aa6f..2ff31f0cecdba204c80f231728bc5b89b50b33e1 100644 (file)
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@@ -45,11 +45,11 @@ ARTICLE: { "regexp" "construction" } "Constructing regular expressions"
  ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
  "Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. Below, the syntax is documented."
  { $heading "Characters" }
-"At its core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } "for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "."
+"At its core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } " for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "."
  { $heading "Concatenation, alternation and grouping" }
-"Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for gropuing. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'."
+"Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for grouping. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'."
  { $heading "Character classes" }
-"Square brackets define a convenient way to refer to a set of characters. For example, " { $snippet "[ab]" } " refers to either a or b. And " { $snippet "[a-z]" } " refers to all of the characters between a and z, in code point order. You can use these together, as in " { $snippet "[ac-fz]" } " which matches all of the characters between c and f, in addition to a and z. Character classes can be negated using a carat, as in " { $snippet "[^a]" } " which matches all characters which are not a."
+"Square brackets define a convenient way to refer to a set of characters. For example, " { $snippet "[ab]" } " refers to either a or b. And " { $snippet "[a-z]" } " refers to all of the characters between a and z, in code point order. You can use these together, as in " { $snippet "[ac-fz]" } " which matches all of the characters between c and f, in addition to a and z. Character classes can be negated using a caret, as in " { $snippet "[^a]" } " which matches all characters which are not a."
  { $heading "Predefined character classes" }
  "Several character classes are predefined, both for convenience and because they are too large to represent directly. In Factor regular expressions, all character classes are Unicode-aware."
  { $table
@@ -72,10 +72,12 @@ ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
      { { $snippet "\\p{blank}" } "Non-newline whitespace" }
      { { $snippet "\\p{cntrl}" } "Control character" }
      { { $snippet "\\p{space}" } "Whitespace" }
-    { { $snippet "\\p{xdigit}" } "Hexidecimal digit" }
+    { { $snippet "\\p{xdigit}" } "Hexadecimal digit" }
      { { $snippet "\\p{Nd}" } "Character in Unicode category Nd" } 
      { { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" } 
      { { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
+{ $heading "Character class operations" }
+"Character classes can be composed using four binary operations: " { $snippet "|| && ~~ --" } ". These perform union, intersection, symmetric difference and difference, respectively. For example, characters which are lower-case but not in the Latin script could be matched as " { $snippet "[\\p{lower}--\\p{script=latin}]" } ". These operations are right-associative, and " { $snippet "^" } " binds tighter than them. There is no syntax for grouping."
  { $heading "Boundaries" }
  "Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters."
  { $table
@@ -107,9 +109,18 @@ ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
  { $heading "Quotation" }
  "To make it convenient to have a long string which uses regexp operators, a special syntax is provided. If a substring begins with " { $snippet "\\Q" } " then everything until " { $snippet "\\E" } " is quoted (escaped). For example, " { $snippet "R/ \\Qfoo\\bar|baz()\\E/" } " matches exactly the string " { $snippet "\"foo\\bar|baz()\"" } "."
  { $heading "Unsupported features" }
-"One missing feature is backreferences. This is because of a design decision to allow only regular expressions following the formal theory of regular languages. For more information, see " { $link { "regexp" "theory" } } ". You can create a new regular expression to match a particular string using " { $vocab-link "regexp.combinators" } " and group capture is available to extract parts of a regular expression match." $nl
-"Another feature is Perl's " { $snippet "\\G" } " syntax, which references the previous match, is not included. This is because that sequence is inherently stateful, and Factor regexps don't hold state." $nl
-"None of the operations which embed code into a regexp are supported, as this would require the inclusion of the Factor parser and compiler in any application which wants to expose regexps to the user. None of the casing operations are included of Perl like \\L, for simplicity." ; ! Also describe syntax, from the beginning
+{ $subheading "Group capture" }
+{ $subheading "Reluctant and possessive quantifiers" }
+{ $subheading "Backreferences" }
+"Backreferences were omitted because of a design decision to allow only regular expressions following the formal theory of regular languages. For more information, see " { $link { "regexp" "theory" } } "."
+$nl
+"To work around the lack of backreferences, consider using group capture and then creating a new regular expression to match the captured string using " { $vocab-link "regexp.combinators" } "."
+{ $subheading "Previous match" }
+"Another feature that is not included is Perl's " { $snippet "\\G" } " syntax, which references the previous match. This is because that sequence is inherently stateful, and Factor regexps don't hold state."
+{ $subheading "Embedding code" }
+"Operations which embed code into a regexp are not supported. This would require the inclusion of the Factor parser and compiler in any deployed application which wants to expose regexps to the user, leading to an undesirable increase in the code size."
+{ $subheading "Casing operations" }
+"No special casing operations are included, for example Perl's " { $snippet "\\L" } "." ;
  
  ARTICLE: { "regexp" "options" } "Regular expression options"
  "When " { $link { "regexp" "construction" } } ", various options can be provided. Options have single-character names. A string of options has one of the following two forms:"
@@ -152,7 +163,7 @@ ARTICLE: { "regexp" "theory" } "The theory of regular expressions"
  "This implies, by DeMorgan's law, that, if you have two regular languages, their intersection is also regular. That is, for any two regular expressions, there exists a regular expression which matches strings that match both inputs." $nl
  "Traditionally, regular expressions on computer support an additional operation: backreferences. For example, the Perl regexp " { $snippet "/(.*)$1/" } " matches a string repated twice. If a backreference refers to a string with a predetermined maximum length, then the resulting language is still regular." $nl
  "But, if not, the language is not regular. There is strong evidence that there is no efficient way to parse with backreferences in the general case. Perl uses a naive backtracking algorithm which has pathological behavior in some cases, taking exponential time to match even if backreferences aren't used. Additionally, expressions with backreferences don't have the properties with negation and intersection described above." $nl
-"The Factor regular expression engine was built with the design decision to support negation and intersection at the expense of backreferences. This lets us have a guaranteed linear-time matching algorithm. Systems like Ragel and Lex also use this algorithm, but in the Factor regular expression engine, all other features of regexps are still present." ;
+"The Factor regular expression engine was built with the design decision to support negation and intersection at the expense of backreferences. This lets us have a guaranteed linear-time matching algorithm. Systems like Ragel and Lex use the same algorithm." ;
  
  ARTICLE: { "regexp" "operations" } "Matching operations with regular expressions"
  "Testing if a string matches a regular expression:"
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor

index 999caeaed63c1a5935ada2db98625b35754f8362..22343868032108956f864c87579bf6f61736c5be 100644 (file)
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -508,3 +508,29 @@ IN: regexp-tests
  [ t ] [ " " R/ \P{LL}/ matches? ] unit-test
  [ f ] [ "a" R/ \P{sCriPt = latin}/ matches? ] unit-test
  [ t ] [ " " R/ \P{SCRIPT = laTIn}/ matches? ] unit-test
+
+! Logical operators
+[ t ] [ "a" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
+[ t ] [ "π" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
+[ t ] [ "A" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
+
+[ t ] [ "a" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
+[ t ] [ "π" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
+[ t ] [ "A" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
+
+[ t ] [ "a" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
+[ f ] [ "π" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
+[ f ] [ "A" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
+
+[ f ] [ "a" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
+[ t ] [ "π" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
+[ t ] [ "A" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
+
+[ f ] [ "a" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
+[ f ] [ "π" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
+[ t ] [ "A" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
diff --git a/basis/simple-flat-file/simple-flat-file-docs.factor b/basis/simple-flat-file/simple-flat-file-docs.factor

index 9ed5de7d2b3da93614db649879e51930e08b8e93..0223d94af9b0f16301dd145046929ea7355823fb 100644 (file)
--- a/basis/simple-flat-file/simple-flat-file-docs.factor
+++ b/basis/simple-flat-file/simple-flat-file-docs.factor
@@ -1,8 +1,24 @@
-USING: help.syntax help.markup strings ;
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.syntax help.markup strings biassocs arrays ;
  IN: simple-flat-file
  
  ABOUT: "simple-flat-file"
  
  ARTICLE: "simple-flat-file" "Parsing simple flat files"
-"The " { $vocab-link "simple-flat-file" } " vocabulary provides words for loading and parsing simple flat files in a particular format which is common for encoding tasks."
-{ $subsection flat-file>biassoc } ;
+"The " { $vocab-link "simple-flat-file" } " vocabulary provides words for loading and parsing simple flat files in a particular format which is common for encoding and Unicode tasks."
+{ $subsection flat-file>biassoc }
+{ $subsection load-interval-file }
+{ $subsection data } ;
+
+HELP: load-interval-file
+{ $values { "filename" string } { "table" "an interval map" } }
+{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
+
+HELP: data
+{ $values { "filename" string } { "data" array } }
+{ $description "This loads a file that's delimited by semicolons and lines, returning an array of lines, where each line is an array split by the semicolons, with whitespace trimmed off." } ;
+
+HELP: flat-file>biassoc
+{ $values { "filename" string } { "biassoc" biassoc } }
+{ $description "This loads a flat file, in the form that many encoding resource files are in, with two columns of numeric data in hex, and returns a biassoc associating them." } ;
diff --git a/basis/simple-flat-file/simple-flat-file.factor b/basis/simple-flat-file/simple-flat-file.factor

index 6e53c97738d476a4c99098b0745ceb95239715b8..88a64b7746592e0c218c8a7a0d4b6bdefcb5a00c 100644 (file)
--- a/basis/simple-flat-file/simple-flat-file.factor
+++ b/basis/simple-flat-file/simple-flat-file.factor
@@ -1,7 +1,7 @@
  ! Copyright (C) 2009 Daniel Ehrenberg
  ! See http://factorcode.org/license.txt for BSD license.
  USING: sequences splitting kernel math.parser io.files io.encodings.utf8
-biassocs ascii ;
+biassocs ascii namespaces arrays make assocs interval-maps sets ;
  IN: simple-flat-file
  
  : drop-comments ( seq -- newseq )
@@ -30,3 +30,25 @@ IN: simple-flat-file
  
  : data ( filename -- data )
      utf8 file-lines drop-comments [ split-; ] map ;
+
+SYMBOL: interned
+
+: range, ( value key -- )
+    swap interned get
+    [ = ] with find nip 2array , ;
+
+: expand-ranges ( assoc -- interval-map )
+    [
+        [
+            swap CHAR: . over member? [
+                ".." split1 [ hex> ] bi@ 2array
+            ] [ hex> ] if range,
+        ] assoc-each
+    ] { } make <interval-map> ;
+
+: process-interval-file ( ranges -- table )
+    dup values prune interned
+    [ expand-ranges ] with-variable ;
+
+: load-interval-file ( filename -- table )
+    data process-interval-file ;
diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor

index 91f6a45911cce51ada86bbd44f4c6896ec1e6edd..22d6cddfb973c40b46fff7f019ff6acd8e353556 100644 (file)
--- a/basis/unicode/breaks/breaks.factor
+++ b/basis/unicode/breaks/breaks.factor
@@ -4,8 +4,9 @@ USING: combinators.short-circuit unicode.categories kernel math
  combinators splitting sequences math.parser io.files io assocs
  arrays namespaces make math.ranges unicode.normalize
  unicode.normalize.private values io.encodings.ascii
-unicode.syntax unicode.data compiler.units fry
-alien.syntax sets accessors interval-maps memoize locals words ;
+unicode.data compiler.units fry unicode.categories.syntax
+alien.syntax sets accessors interval-maps memoize locals words
+simple-flat-file ;
  IN: unicode.breaks
  
  <PRIVATE
@@ -31,9 +32,9 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ;
          [ drop Control ]
      } case ;
  
-CATEGORY: (extend) Me Mn ;
-: extend? ( ch -- ? )
-    { [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ;
+CATEGORY: extend
+    Me Mn |
+    "Other_Grapheme_Extend" property? ;
  
  : loe? ( ch -- ? )
      "Logical_Order_Exception" property? ;
@@ -127,7 +128,7 @@ to: grapheme-table
  
  VALUE: word-break-table
  
-"vocab:unicode/data/WordBreakProperty.txt" load-key-value
+"vocab:unicode/data/WordBreakProperty.txt" load-interval-file
  to: word-break-table
  
  C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
diff --git a/basis/unicode/case/case.factor b/basis/unicode/case/case.factor

index fa842b8b818a1bed743ea5e46c647877f4c7469d..1ad39317469939c54b144961b84f3df21598c440 100644 (file)
--- a/basis/unicode/case/case.factor
+++ b/basis/unicode/case/case.factor
@@ -1,8 +1,8 @@
  ! Copyright (C) 2008, 2009 Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: unicode.data sequences namespaces
-sbufs make unicode.syntax unicode.normalize math hints
-unicode.categories combinators unicode.syntax assocs combinators.short-circuit
+sbufs make unicode.normalize math hints
+unicode.categories combinators assocs combinators.short-circuit
  strings splitting kernel accessors unicode.breaks fry locals ;
  QUALIFIED: ascii
  IN: unicode.case
diff --git a/basis/unicode/categories/categories-docs.factor b/basis/unicode/categories/categories-docs.factor

index b0870e28fb881c90705b87383449d9bccada73bc..924b197417d74fa72ffa29dca29b85c3726e0fbc 100644 (file)
--- a/basis/unicode/categories/categories-docs.factor
+++ b/basis/unicode/categories/categories-docs.factor
@@ -12,6 +12,9 @@ HELP: Letter
  HELP: alpha
  { $class-description "The class of alphanumeric characters." } ;
  
+HELP: math
+{ $class-description "The class of Unicode math characters." } ;
+
  HELP: blank
  { $class-description "The class of whitespace characters." } ;
  
@@ -54,6 +57,8 @@ ARTICLE: "unicode.categories" "Character classes"
  { $subsection uncased }
  { $subsection uncased? }
  { $subsection character }
-{ $subsection character? } ;
+{ $subsection character? }
+{ $subsection math }
+{ $subsection math? } ;
  
  ABOUT: "unicode.categories"
diff --git a/basis/unicode/categories/categories.factor b/basis/unicode/categories/categories.factor

index 0464e31b125063b60fa21489d8865b055efd60b4..126c03c8698c431e5fea9b32be446675122f1948 100644 (file)
--- a/basis/unicode/categories/categories.factor
+++ b/basis/unicode/categories/categories.factor
@@ -1,15 +1,16 @@
  ! Copyright (C) 2008 Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.syntax ;
+USING: unicode.categories.syntax sequences unicode.data ;
  IN: unicode.categories
  
-CATEGORY: blank Zs Zl Zp \r\n ;
-CATEGORY: letter Ll ;
-CATEGORY: LETTER Lu ;
-CATEGORY: Letter Lu Ll Lt Lm Lo ;
+CATEGORY: blank Zs Zl Zp | "\r\n" member? ;
+CATEGORY: letter Ll | "Other_Lowercase" property? ;
+CATEGORY: LETTER Lu | "Other_Uppercase" property? ;
+CATEGORY: Letter Lu Ll Lt Lm Lo Nl ;
  CATEGORY: digit Nd Nl No ;
  CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
-CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
+CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ;
  CATEGORY: control Cc ;
  CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ; 
  CATEGORY-NOT: character Cn ;
+CATEGORY: math Sm | "Other_Math" property? ;
diff --git a/basis/unicode/categories/syntax/authors.txt b/basis/unicode/categories/syntax/authors.txt

new file mode 100755 (executable)

index 0000000..f990dd0
--- /dev/null
+++ b/basis/unicode/categories/syntax/authors.txt
@@ -0,0 +1 @@
+Daniel Ehrenberg
diff --git a/basis/unicode/categories/syntax/summary.txt b/basis/unicode/categories/syntax/summary.txt

new file mode 100644 (file)

index 0000000..651d51c
--- /dev/null
+++ b/basis/unicode/categories/syntax/summary.txt
@@ -0,0 +1 @@
+Parsing words used by Unicode implementation
diff --git a/basis/unicode/categories/syntax/syntax-docs.factor b/basis/unicode/categories/syntax/syntax-docs.factor

new file mode 100644 (file)

index 0000000..6293b92
--- /dev/null
+++ b/basis/unicode/categories/syntax/syntax-docs.factor
@@ -0,0 +1,19 @@
+! Copyright (C) 2008 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.syntax help.markup ;
+IN: unicode.categories.syntax
+
+ABOUT: "unicode.categories.syntax"
+
+ARTICLE: "unicode.categories.syntax" "Unicode category syntax"
+"There is syntactic sugar for defining predicate classes which are unions of Unicode general categories, optionally extended with extra predicate code given after " { $snippet "|" } "."
+{ $subsection POSTPONE: CATEGORY: }
+{ $subsection POSTPONE: CATEGORY-NOT: } ;
+
+HELP: CATEGORY:
+{ $syntax "CATEGORY: foo Nl Pd Lu | \"Diacritic\" property? ;" }
+{ $description "This defines a predicate class which is a subset of code points. In this example, " { $snippet "foo" } " is the class of characters which are in the general category Nl or Pd or Lu, or which have the Diacritic property." } ;
+
+HELP: CATEGORY-NOT:
+{ $syntax "CATEGORY-NOT: foo Nl Pd Lu | \"Diacritic\" property? ;" }
+{ $description "This defines a predicate class which is a subset of code points, the complement of what " { $link POSTPONE: CATEGORY: } " would define. In this example, " { $snippet "foo" } " is the class of characters which are neither in the general category Nl or Pd or Lu, nor have the Diacritic property." } ;
diff --git a/basis/unicode/categories/syntax/syntax-tests.factor b/basis/unicode/categories/syntax/syntax-tests.factor

new file mode 100644 (file)

index 0000000..1ec622f
--- /dev/null
+++ b/basis/unicode/categories/syntax/syntax-tests.factor
@@ -0,0 +1,3 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+
diff --git a/basis/unicode/categories/syntax/syntax.factor b/basis/unicode/categories/syntax/syntax.factor

new file mode 100644 (file)

index 0000000..849f361
--- /dev/null
+++ b/basis/unicode/categories/syntax/syntax.factor
@@ -0,0 +1,34 @@
+! Copyright (C) 2008, 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: unicode.data kernel math sequences parser unicode.data.private
+bit-arrays namespaces sequences.private arrays classes.parser
+assocs classes.predicate sets fry splitting accessors ;
+IN: unicode.categories.syntax
+
+! For use in CATEGORY:
+SYMBOLS: Cn Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co | ;
+
+<PRIVATE
+
+: [category] ( categories code -- quot )
+    '[ dup category# _ member? [ drop t ] _ if ] ;
+
+: integer-predicate-class ( word predicate -- )
+    integer swap define-predicate-class ;
+
+: define-category ( word categories code -- )
+    [category] integer-predicate-class ;
+
+: define-not-category ( word categories code -- )
+    [category] [ not ] compose integer-predicate-class ;
+
+: parse-category ( -- word tokens quot )
+    CREATE-CLASS \ ; parse-until { | } split1
+    [ [ name>> categories-map at ] map ]
+    [ [ [ ] like ] [ [ drop f ] ] if* ] bi* ;
+
+PRIVATE>
+
+SYNTAX: CATEGORY: parse-category define-category ;
+
+SYNTAX: CATEGORY-NOT: parse-category define-not-category ;
diff --git a/basis/unicode/categories/syntax/tags.txt b/basis/unicode/categories/syntax/tags.txt

new file mode 100755 (executable)

index 0000000..8e27be7
--- /dev/null
+++ b/basis/unicode/categories/syntax/tags.txt
@@ -0,0 +1 @@
+text
diff --git a/basis/unicode/collation/collation.factor b/basis/unicode/collation/collation.factor

index 0c51ea4352efda97386b7528886f4bc2639a288c..b6eddccae074f7257f9226af3e3f217c6b02bb5e 100755 (executable)
--- a/basis/unicode/collation/collation.factor
+++ b/basis/unicode/collation/collation.factor
@@ -4,7 +4,7 @@ USING: combinators.short-circuit sequences io.files
  io.encodings.ascii kernel values splitting accessors math.parser\r
  ascii io assocs strings math namespaces make sorting combinators\r
  math.order arrays unicode.normalize unicode.data locals\r
-unicode.syntax macros sequences.deep words unicode.breaks\r
+macros sequences.deep words unicode.breaks\r
  quotations combinators.short-circuit simple-flat-file ;\r
  IN: unicode.collation\r
  \r
diff --git a/basis/unicode/data/data-docs.factor b/basis/unicode/data/data-docs.factor

index d1a458eb480066de5fe5bb8fd913f30375268156..82706729bf94ee7e2ed6ea97e5a13ab0b63b8517 100644 (file)
--- a/basis/unicode/data/data-docs.factor
+++ b/basis/unicode/data/data-docs.factor
@@ -6,7 +6,7 @@ IN: unicode.data
  ABOUT: "unicode.data"
  
  ARTICLE: "unicode.data" "Unicode data tables"
-"The " { $vocab-link "unicode.data" "unicode.data" } " vocabulary contains core Unicode data tables and code for parsing this from files."
+"The " { $vocab-link "unicode.data" "unicode.data" } " vocabulary contains core Unicode data tables and code for parsing this from files. The following words access these data tables."
  { $subsection canonical-entry }
  { $subsection combine-chars }
  { $subsection combining-class }
@@ -14,7 +14,11 @@ ARTICLE: "unicode.data" "Unicode data tables"
  { $subsection name>char }
  { $subsection char>name }
  { $subsection property? }
-{ $subsection load-key-value } ;
+{ $subsection category }
+{ $subsection ch>upper }
+{ $subsection ch>lower } 
+{ $subsection ch>title } 
+{ $subsection special-case } ;
  
  HELP: canonical-entry
  { $values { "char" "a code point" } { "seq" string } }
@@ -48,6 +52,22 @@ HELP: property?
  { $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
  { $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
  
-HELP: load-key-value
-{ $values { "filename" string } { "table" "an interval map" } }
-{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
+HELP: category
+{ $values { "char" "a code point" } { "category" string } }
+{ $description "Returns the general category of a code point, in the form of a string. This is always a two-character ASCII string. If the code point is unassigned, then it returns " { $snippet "Cn" } "." } ;
+
+HELP: ch>upper
+{ $values { "ch" "a code point" } { "upper" "a code point" } }
+{ $description "Returns the simple upper-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
+
+HELP: ch>lower
+{ $values { "ch" "a code point" } { "lower" "a code point" } }
+{ $description "Returns the simple lower-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
+
+HELP: ch>title
+{ $values { "ch" "a code point" } { "title" "a code point" } }
+{ $description "Returns the simple title-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
+
+HELP: special-case
+{ $values { "ch" "a code point" } { "casing-tuple" { "a tuple, or " { $link f } } } }
+{ $description "If a code point has special casing behavior, returns a tuple which represents that information; otherwise returns " { $link f } "." } ;
diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor

index e94036a85e6cf4bb6944526ee2b6b97b41e58547..779ae64d485b3ee293c251183b5217d697a995df 100644 (file)
--- a/basis/unicode/data/data.factor
+++ b/basis/unicode/data/data.factor
@@ -58,7 +58,7 @@ CONSTANT: num-chars HEX: 2FA1E
  
  PRIVATE>
  
-: category# ( char -- category )
+: category# ( char -- n )
      ! There are a few characters that should be Cn
      ! that this gives Cf or Mn
      ! Cf = 26; Mn = 5; Cn = 29
@@ -219,27 +219,3 @@ load-properties to: properties
  
  [ name>char [ "Invalid character" throw ] unless* ]
  name>char-hook set-global
-
-SYMBOL: interned
-
-: range, ( value key -- )
-    swap interned get
-    [ = ] with find nip 2array , ;
-
-: expand-ranges ( assoc -- interval-map )
-    [
-        [
-            swap CHAR: . over member? [
-                ".." split1 [ hex> ] bi@ 2array
-            ] [ hex> ] if range,
-        ] assoc-each
-    ] { } make <interval-map> ;
-
-: process-key-value ( ranges -- table )
-    dup values prune interned
-    [ expand-ranges ] with-variable ;
-
-PRIVATE>
-
-: load-key-value ( filename -- table )
-    data process-key-value ;
diff --git a/basis/unicode/normalize/normalize.factor b/basis/unicode/normalize/normalize.factor

index 602d9555ea64c26d775f0057cd8b3140b0f1c43f..aca96a56942c315303dc84afd4c52a9061883c7c 100644 (file)
--- a/basis/unicode/normalize/normalize.factor
+++ b/basis/unicode/normalize/normalize.factor
@@ -2,7 +2,7 @@
  ! See http://factorcode.org/license.txt for BSD license.
  USING: ascii sequences namespaces make unicode.data kernel math arrays
  locals sorting.insertion accessors assocs math.order combinators
-unicode.syntax strings sbufs hints combinators.short-circuit vectors ;
+strings sbufs hints combinators.short-circuit vectors ;
  IN: unicode.normalize
  
  <PRIVATE
diff --git a/basis/unicode/script/script.factor b/basis/unicode/script/script.factor

index ed804760848a07c8c275b4e6dbce4c3d719678e4..4243c816234ffb5de1eb0c5d642b243ff426c380 100644 (file)
--- a/basis/unicode/script/script.factor
+++ b/basis/unicode/script/script.factor
@@ -1,17 +1,13 @@
  ! Copyright (C) 2008 Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors values kernel sequences assocs io.files
-io.encodings ascii math.ranges io splitting math.parser
-namespaces make byte-arrays locals math sets io.encodings.ascii
-words words.symbol compiler.units arrays interval-maps
-unicode.data ;
+USING: values interval-maps simple-flat-file ;
  IN: unicode.script
  
  <PRIVATE
  
  VALUE: script-table
  
-"vocab:unicode/script/Scripts.txt" load-key-value
+"vocab:unicode/script/Scripts.txt" load-interval-file
  to: script-table
  
  PRIVATE>
diff --git a/basis/unicode/syntax/authors.txt b/basis/unicode/syntax/authors.txt

deleted file mode 100755 (executable)

index f990dd0..0000000
--- a/basis/unicode/syntax/authors.txt
+++ /dev/null
@@ -1 +0,0 @@
-Daniel Ehrenberg
diff --git a/basis/unicode/syntax/summary.txt b/basis/unicode/syntax/summary.txt

deleted file mode 100644 (file)

index 651d51c..0000000
--- a/basis/unicode/syntax/summary.txt
+++ /dev/null
@@ -1 +0,0 @@
-Parsing words used by Unicode implementation
diff --git a/basis/unicode/syntax/syntax.factor b/basis/unicode/syntax/syntax.factor

deleted file mode 100644 (file)

index dfae31d..0000000
--- a/basis/unicode/syntax/syntax.factor
+++ /dev/null
@@ -1,38 +0,0 @@
-! Copyright (C) 2008 Daniel Ehrenberg.
-! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.data kernel math sequences parser lexer
-bit-arrays namespaces make sequences.private arrays quotations
-assocs classes.predicate math.order strings.parser ;
-IN: unicode.syntax
-
-<PRIVATE
-
-: >category-array ( categories -- bitarray )
-    categories [ swap member? ] with map >bit-array ;
-
-: as-string ( strings -- bit-array )
-    concat unescape-string ;
-
-: [category] ( categories -- quot )
-    [
-        [ [ categories member? not ] filter as-string ] keep 
-        [ categories member? ] filter >category-array
-        [ dup category# ] % , [ nth-unsafe [ drop t ] ] %
-        \ member? 2array >quotation ,
-        \ if ,
-    ] [ ] make ;
-
-: define-category ( word categories -- )
-    [category] integer swap define-predicate-class ;
-
-PRIVATE>
-
-SYNTAX: CATEGORY:
-    CREATE ";" parse-tokens define-category ;
-
-: seq-minus ( seq1 seq2 -- diff )
-    [ member? not ] curry filter ;
-
-SYNTAX: CATEGORY-NOT:
-    CREATE ";" parse-tokens
-    categories swap seq-minus define-category ;
diff --git a/basis/unicode/syntax/tags.txt b/basis/unicode/syntax/tags.txt

deleted file mode 100755 (executable)

index 8e27be7..0000000
--- a/basis/unicode/syntax/tags.txt
+++ /dev/null
@@ -1 +0,0 @@
-text
diff --git a/basis/unicode/unicode-docs.factor b/basis/unicode/unicode-docs.factor

index 4ae326ac84bf3429c33edb0960b4856fff625277..9450b49f0bd2f14bae20dcc19c15ecfb7093f92c 100644 (file)
--- a/basis/unicode/unicode-docs.factor
+++ b/basis/unicode/unicode-docs.factor
@@ -15,7 +15,7 @@ $nl
  { $vocab-subsection "Word and grapheme breaks" "unicode.breaks" }
  { $vocab-subsection "Unicode normalization" "unicode.normalize" }
  "The following are mostly for internal use:"
-{ $vocab-subsection "Unicode syntax" "unicode.syntax" }
+{ $vocab-subsection "Unicode category syntax" "unicode.categories.syntax" }
  { $vocab-subsection "Unicode data tables" "unicode.data" }
  { $see-also "ascii" "io.encodings" } ;
  
diff --git a/basis/xml/char-classes/char-classes.factor b/basis/xml/char-classes/char-classes.factor

index d510c8a881d47e8d9538db82b0653b0d1b7b3be3..3deab0a2872189681a76e52d5d4a7bd26474b3be 100644 (file)
--- a/basis/xml/char-classes/char-classes.factor
+++ b/basis/xml/char-classes/char-classes.factor
@@ -1,19 +1,26 @@
  ! Copyright (C) 2005, 2009 Daniel Ehrenberg
  ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel sequences unicode.syntax math math.order combinators
-hints ;
+USING: kernel sequences unicode.categories.syntax math math.order
+combinators hints combinators.short-circuit ;
  IN: xml.char-classes
  
-CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
-: 1.0name-start? ( char -- ? )
-    dup 1.0name-start*? [ drop t ] 
-    [ HEX: 2BB HEX: 2C1 between? ] if ;
+CATEGORY: 1.0name-start
+    Ll Lu Lo Lt Nl | {
+        [ HEX: 2BB HEX: 2C1 between? ]
+        [ "\u000559\u0006E5\u0006E6_:" member? ]
+    } 1|| ;
  
-CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ;
+CATEGORY: 1.0name-char
+    Ll Lu Lo Lt Nl Mc Me Mn Lm Nd |
+    "_-.\u000387:" member? ;
  
-CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ;
+CATEGORY: 1.1name-start
+    Ll Lu Lo Lm Nl |
+    "_:" member? ;
  
-CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
+CATEGORY: 1.1name-char
+    Ll Lu Lo Lm Nl Mc Mn Nd Pc Cf |
+    "_-.\u0000b7:" member? ;
  
  : name-start? ( 1.0? char -- ? )
      swap [ 1.0name-start? ] [ 1.1name-start? ] if ;
diff --git a/extra/mason/child/child.factor b/extra/mason/child/child.factor

index 1999c76d83545fb2ab07f7576ccdb879ade1b643..04c4a09f6186754ea0a142b056ab9df8e3adf03a 100644 (file)
--- a/extra/mason/child/child.factor
+++ b/extra/mason/child/child.factor
@@ -67,7 +67,7 @@ IN: mason.child
          try-process
      ] with-directory ;
  
-: return-with ( obj -- ) return-continuation get continue-with ;
+: return-with ( obj -- * ) return-continuation get continue-with ;
  
  : build-clean? ( -- ? )
      {
diff --git a/extra/mason/report/report.factor b/extra/mason/report/report.factor

index 1b2697a5d1cba3ade471428c3318b4f24058e877..52e1608885f6e3901de4250523d8fbc2aa4ecddc 100644 (file)
--- a/extra/mason/report/report.factor
+++ b/extra/mason/report/report.factor
@@ -16,7 +16,7 @@ IN: mason.report
      "git id: " write "git-id" eval-file print nl ;
  
  : with-report ( quot -- )
-    [ "report" utf8 ] dip '[ common-report @ ] with-file-writer ;
+    [ "report" utf8 ] dip '[ common-report @ ] with-file-writer ; inline
  
  : compile-failed-report ( error -- )
      [
diff --git a/extra/multi-methods/multi-methods.factor b/extra/multi-methods/multi-methods.factor

index 7c5d5fb431c1d01414efeb024af994ca401e9f63..ec069a4894cb7b51b1d2a2366721e12ad7d93e1f 100755 (executable)
--- a/extra/multi-methods/multi-methods.factor
+++ b/extra/multi-methods/multi-methods.factor
@@ -224,8 +224,7 @@ M: no-method error.
      ] if ;
  
  ! Syntax
-: GENERIC:
-    CREATE define-generic ; parsing
+SYNTAX: GENERIC: CREATE define-generic ;
  
  : parse-method ( -- quot classes generic )
      parse-definition [ 2 tail ] [ second ] [ first ] tri ;
@@ -238,13 +237,13 @@ M: no-method error.
  
  : (METHOD:) ( -- method def ) CREATE-METHOD parse-definition ;
  
-: METHOD: (METHOD:) define ; parsing
+SYNTAX: METHOD: (METHOD:) define ;
  
  ! For compatibility
-: M:
+SYNTAX: M:
      scan-word 1array scan-word create-method-in
      parse-definition
-    define ; parsing
+    define ;
  
  ! Definition protocol. We qualify core generics here
  QUALIFIED: syntax
diff --git a/extra/peg-lexer/peg-lexer.factor b/extra/peg-lexer/peg-lexer.factor

index 1b5f17df4c7abbacca362feef92a911455059b1e..90d2e0e34c80d6782552400b2a36d6ba548a26cc 100644 (file)
--- a/extra/peg-lexer/peg-lexer.factor
+++ b/extra/peg-lexer/peg-lexer.factor
@@ -9,36 +9,46 @@ CONSULT: assoc-protocol lex-hash hash>> ;
  : pos-or-0 ( neg? -- pos/0 ) dup 0 < [ drop 0 ] when ;
  
  :: prepare-pos ( v i -- c l )
- [let | n [ i v head-slice ] |
-      v CHAR: \n n last-index -1 or 1+ -
-      n [ CHAR: \n = ] count 1+ ] ;
+    [let | n [ i v head-slice ] |
+           v CHAR: \n n last-index -1 or 1+ -
+           n [ CHAR: \n = ] count 1+
+    ] ;
        
-: store-pos ( v a -- ) input swap at prepare-pos
-   lexer get [ (>>line) ] keep (>>column) ;
+: store-pos ( v a -- )
+    input swap at prepare-pos
+    lexer get [ (>>line) ] keep (>>column) ;
  
-M: lex-hash set-at swap {
-   { pos [ store-pos ] }
-   [ swap hash>> set-at ] } case ;
+M: lex-hash set-at
+    swap {
+        { pos [ store-pos ] }
+        [ swap hash>> set-at ]
+    } case ;
  
  :: at-pos ( t l c -- p ) t l head-slice [ length ] map sum l 1- + c + ;
  
-M: lex-hash at* swap {
+M: lex-hash at*
+    swap {
        { input [ drop lexer get text>> "\n" join t ] }
        { pos [ drop lexer get [ text>> ] [ line>> 1- ] [ column>> 1+ ] tri at-pos t ] }
-      [ swap hash>> at* ] } case ;
+      [ swap hash>> at* ]
+    } case ;
  
  : with-global-lexer ( quot -- result )
-   [ f lrstack set
-        V{ } clone error-stack set H{ } clone \ heads set
-        H{ } clone \ packrat set ] f make-assoc <lex-hash>
+   [
+       f lrstack set
+       V{ } clone error-stack set H{ } clone \ heads set
+       H{ } clone \ packrat set
+   ] f make-assoc <lex-hash>
     swap bind ; inline
  
-: parse* ( parser -- ast ) compile
-   [ execute [ error-stack get first throw ] unless* ] with-global-lexer
-   ast>> ;
+: parse* ( parser -- ast )
+    compile
+    [ execute [ error-stack get first throw ] unless* ] with-global-lexer
+    ast>> ;
  
-: create-bnf ( name parser -- ) reset-tokenizer [ lexer get skip-blank parse* parsed ] curry
-    define word make-parsing ;
+: create-bnf ( name parser -- )
+    reset-tokenizer [ lexer get skip-blank parse* parsed ] curry
+    define-syntax ;
      
  SYNTAX: ON-BNF:
      CREATE-WORD reset-tokenizer ";ON-BNF" parse-multiline-string parse-ebnf
author	Slava Pestov <slava@slava-pestovs-macbook-pro.local>
	Sat, 21 Mar 2009 23:13:47 +0000 (18:13 -0500)
committer	Slava Pestov <slava@slava-pestovs-macbook-pro.local>
	Sat, 21 Mar 2009 23:13:47 +0000 (18:13 -0500)
basis/html/templates/chloe/components/components.factor		patch \| blob \| history
basis/io/encodings/iso2022/iso2022-tests.factor		patch \| blob \| history
basis/io/encodings/iso2022/iso2022.factor		patch \| blob \| history
basis/regexp/authors.txt		patch \| blob \| history
basis/regexp/classes/classes.factor		patch \| blob \| history
basis/regexp/parser/parser-tests.factor		patch \| blob \| history
basis/regexp/parser/parser.factor		patch \| blob \| history
basis/regexp/regexp-docs.factor		patch \| blob \| history
basis/regexp/regexp-tests.factor		patch \| blob \| history
basis/simple-flat-file/simple-flat-file-docs.factor		patch \| blob \| history
basis/simple-flat-file/simple-flat-file.factor		patch \| blob \| history
basis/unicode/breaks/breaks.factor		patch \| blob \| history
basis/unicode/case/case.factor		patch \| blob \| history
basis/unicode/categories/categories-docs.factor		patch \| blob \| history
basis/unicode/categories/categories.factor		patch \| blob \| history
basis/unicode/categories/syntax/authors.txt	[new file with mode: 0755]	patch \| blob
basis/unicode/categories/syntax/summary.txt	[new file with mode: 0644]	patch \| blob
basis/unicode/categories/syntax/syntax-docs.factor	[new file with mode: 0644]	patch \| blob
basis/unicode/categories/syntax/syntax-tests.factor	[new file with mode: 0644]	patch \| blob
basis/unicode/categories/syntax/syntax.factor	[new file with mode: 0644]	patch \| blob
basis/unicode/categories/syntax/tags.txt	[new file with mode: 0755]	patch \| blob
basis/unicode/collation/collation.factor		patch \| blob \| history
basis/unicode/data/data-docs.factor		patch \| blob \| history
basis/unicode/data/data.factor		patch \| blob \| history
basis/unicode/normalize/normalize.factor		patch \| blob \| history
basis/unicode/script/script.factor		patch \| blob \| history
basis/unicode/syntax/authors.txt	[deleted file]	patch \| blob \| history
basis/unicode/syntax/summary.txt	[deleted file]	patch \| blob \| history
basis/unicode/syntax/syntax.factor	[deleted file]	patch \| blob \| history
basis/unicode/syntax/tags.txt	[deleted file]	patch \| blob \| history
basis/unicode/unicode-docs.factor		patch \| blob \| history
basis/xml/char-classes/char-classes.factor		patch \| blob \| history
extra/mason/child/child.factor		patch \| blob \| history
extra/mason/report/report.factor		patch \| blob \| history
extra/multi-methods/multi-methods.factor		patch \| blob \| history
extra/peg-lexer/peg-lexer.factor		patch \| blob \| history