More regexp docs; unix line ending support

author Daniel Ehrenberg <littledan@Macintosh-122.local>

Mon, 16 Mar 2009 22:53:38 +0000 (17:53 -0500)

committer Daniel Ehrenberg <littledan@Macintosh-122.local>

Mon, 16 Mar 2009 22:53:38 +0000 (17:53 -0500)
author Daniel Ehrenberg <littledan@Macintosh-122.local>
Mon, 16 Mar 2009 22:53:38 +0000 (17:53 -0500)
committer Daniel Ehrenberg <littledan@Macintosh-122.local>
Mon, 16 Mar 2009 22:53:38 +0000 (17:53 -0500)
diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor

index ffaed2db62367001df0bec3c848bc9b05133ef84..1c11ed5c7d58070ba5e51d29d48d2fb605963714 100644 (file)
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@@ -37,8 +37,7 @@ C: <with-options> with-options
  TUPLE: options on off ;
  C: <options> options
  
-SINGLETONS: unix-lines dotall multiline comments case-insensitive
-unicode-case reversed-regexp ;
+SINGLETONS: unix-lines dotall multiline case-insensitive reversed-regexp ;
  
  : <maybe> ( term -- term' )
      f <concatenation> 2array <alternation> ;
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor

index d26ff7f69ceab3e20812c1d96a5f34a3b233456b..e3a177458591bff0d0b99d4ce6f2ebd75e31afef 100644 (file)
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -12,7 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class
  control-character-class hex-digit-class java-blank-class c-identifier-class
  unmatchable-class terminator-class word-boundary-class ;
  
-SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file word-break ;
+SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ;
  
  TUPLE: range from to ;
  C: <range> range
diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor

index b55cab62946a92c6dc47ceed25bd900e1f1b2ceb..95511965d19e2c8feb94a54b14d4a633eff192f7 100644 (file)
--- a/basis/regexp/compiler/compiler.factor
+++ b/basis/regexp/compiler/compiler.factor
@@ -17,9 +17,6 @@ SYMBOL: backwards?
  M: t question>quot drop [ 2drop t ] ;
  M: f question>quot drop [ 2drop f ] ;
  
-M: not-class question>quot
-    class>> question>quot [ not ] compose ;
-
  M: beginning-of-input question>quot
      drop [ drop zero? ] ;
  
@@ -40,6 +37,12 @@ M: $ question>quot
  M: ^ question>quot
      drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
  
+M: $unix question>quot
+    drop [ { [ length = ] [ ?nth CHAR: \n = ] } 2|| ] ;
+
+M: ^unix question>quot
+    drop [ { [ drop zero? ] [ [ 1- ] dip ?nth CHAR: \n = ] } 2|| ] ;
+
  M: word-break question>quot
      drop [ word-break-at? ] ;
  
diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor

index 20be6b87d852678755b071a29ebcb78e97ad9afc..d59d4818ec7ef5926a8dbd13ca4f9c5c61bdf347 100644 (file)
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@@ -60,11 +60,16 @@ GENERIC: modify-epsilon ( tag -- newtag )
  
  M: object modify-epsilon ;
  
+: line-option ( multiline unix-lines default -- option )
+    multiline option? [
+        drop [ unix-lines option? ] 2dip swap ?
+    ] [ 2nip ] if ;
+
  M: $ modify-epsilon
-    multiline option? [ drop end-of-input ] unless ;
+    $unix end-of-input line-option ;
  
  M: ^ modify-epsilon
-    multiline option? [ drop beginning-of-input ] unless ;
+    ^unix beginning-of-input line-option ;
  
  M: tagged-epsilon nfa-node
      clone [ modify-epsilon ] change-tag add-simple-entry ;
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor

index c6a69f250875a2ddf999844f19c10a0f79dda013..7b2d6af2c1d17afb1fc8cd0de6d73ce5f22330e5 100644 (file)
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -2,7 +2,7 @@
  ! See http://factorcode.org/license.txt for BSD license.
  USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
  combinators regexp.classes strings splitting peg locals accessors
-regexp.ast ;
+regexp.ast unicode.case ;
  IN: regexp.parser
  
  : allowed-char? ( ch -- ? )
@@ -19,20 +19,19 @@ ERROR: bad-number ;
  ERROR: bad-class name ;
  
  : name>class ( name -- class )
-    {
-        { "Lower" letter-class }
-        { "Upper" LETTER-class }
-        { "Alpha" Letter-class }
-        { "ASCII" ascii-class }
-        { "Digit" digit-class }
-        { "Alnum" alpha-class }
-        { "Punct" punctuation-class }
-        { "Graph" java-printable-class }
-        { "Print" java-printable-class }
-        { "Blank" non-newline-blank-class }
-        { "Cntrl" control-character-class }
-        { "XDigit" hex-digit-class }
-        { "Space" java-blank-class }
+    >string >case-fold {
+        { "lower" letter-class }
+        { "upper" LETTER-class }
+        { "alpha" Letter-class }
+        { "ascii" ascii-class }
+        { "digit" digit-class }
+        { "alnum" alpha-class }
+        { "punct" punctuation-class }
+        { "graph" java-printable-class }
+        { "blank" non-newline-blank-class }
+        { "cntrl" control-character-class }
+        { "xdigit" hex-digit-class }
+        { "space" java-blank-class }
          ! TODO: unicode-character-class
      } [ bad-class ] at-error ;
  
@@ -66,11 +65,8 @@ ERROR: bad-class name ;
          { CHAR: i case-insensitive }
          { CHAR: d unix-lines }
          { CHAR: m multiline }
-        { CHAR: n multiline }
          { CHAR: r reversed-regexp }
          { CHAR: s dotall }
-        { CHAR: u unicode-case }
-        { CHAR: x comments }
      } ;
  
  : ch>option ( ch -- singleton )
@@ -101,8 +97,8 @@ CharacterInBracket = !("}") Character
  
  QuotedCharacter = !("\\E") .
  
-Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
-       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
+Escape = "p{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> ]]
+       | "P{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> <negation> ]]
         | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
         | "u" Character:a Character:b Character:c Character:d
              => [[ { a b c d } hex> ensure-number ]]
diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor

index b35f8d1cf31fff64b20a6260810aabd186d0114c..a7cb0a3715d28df75f628c9b23d9f7e472a7ae51 100644 (file)
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@@ -33,20 +33,71 @@ ARTICLE: { "regexp" "construction" } "Constructing regular expressions"
  "Another approach is to use " { $vocab-link "regexp.combinators" } "." ;
  
  ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
-"Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. A new addition is the inclusion of a negation operator, with the syntax " { $snippet "(?~foo)" } " to match everything that does not match " { $snippet "foo" } "."
+"Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. Below, the syntax is documented."
  { $heading "Characters" }
+"At their core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxx" } " for a Unicode code point, given by its four-digit hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "."
+{ $heading "Concatenation, alternation and grouping" }
+"Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for grouping. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'."
  { $heading "Character classes" }
+"Square brackets define a convenient way to refer to a set of characters. For example, " { $snippet "[ab]" } " refers to either a or b. And " { $snippet "[a-z]" } " refers to all of the characters between a and z, in code point order. You can use these together, as in " { $snippet "[ac-fz]" } " which matches all of the characters between c and f, in addition to a and z. Character classes can be negated using a caret, as in " { $snippet "[^a]" } " which matches all characters which are not a."
  { $heading "Predefined character classes" }
+"Several character classes are predefined, both for convenience and because they are too large to represent directly. In Factor regular expressions, all character classes are Unicode-aware."
+{ $table
+    { { $snippet "\\d" } "Digits" }
+    { { $snippet "\\D" } "Not digits" }
+    { { $snippet "\\s" } "Whitespace" }
+    { { $snippet "\\S" } "Not whitespace" }
+    { { $snippet "\\w" } "Word character (alphanumeric or underscore)" }
+    { { $snippet "\\W" } "Not word character" }
+    { { $snippet "\\p{property}" } "Character which fulfils the property" }
+    { { $snippet "\\P{property}" } "Character which does not fulfil the property" } }
+"Properties for " { $snippet "\\p" } " and " { $snippet "\\P" } " (case-insensitive):"
+{ $table
+    { { $snippet "\\p{lower}" } "Lower case letters" }
+    { { $snippet "\\p{upper}" } "Upper case letters" }
+    { { $snippet "\\p{alpha}" } "Letters" }
+    { { $snippet "\\p{ascii}" } "Characters in the ASCII range" }
+    { { $snippet "\\p{alnum}" } "Letters or numbers" }
+    { { $snippet "\\p{punct}" } "Punctuation" }
+    { { $snippet "\\p{blank}" } "Non-newline whitespace" }
+    { { $snippet "\\p{cntrl}" } "Control character" }
+    { { $snippet "\\p{space}" } "Whitespace" }
+    { { $snippet "\\p{xdigit}" } "Hexadecimal digit" } } ! In the future: Unicode
+"Full unicode properties are not yet supported."
  { $heading "Boundaries" }
+"Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters."
+{ $table
+    { { $snippet "^" } "Beginning of a line" }
+    { { $snippet "$" } "End of a line" }
+    { { $snippet "\\A" } "Beginning of text" }
+    { { $snippet "\\z" } "End of text" }
+    { { $snippet "\\Z" } "Almost end of text: only thing after is newline" }
+    { { $snippet "\\b" } "Word boundary (by Unicode word boundaries)" }
+    { { $snippet "\\B" } "Not word boundary (by Unicode word boundaries)" } }
  { $heading "Greedy quantifiers" }
-{ $heading "Reluctant quantifiers" }
-{ $heading "Posessive quantifiers" }
-{ $heading "Logical operations" }
+"It is possible to have a regular expression which matches a variable number of occurrences of another regular expression."
+{ $table
+    { { $snippet "a*" } "Zero or more occurrences of a" }
+    { { $snippet "a+" } "One or more occurrences of a" }
+    { { $snippet "a?" } "Zero or one occurrences of a" }
+    { { $snippet "a{n}" } "n occurrences of a" }
+    { { $snippet "a{n,}" } "At least n occurrences of a" }
+    { { $snippet "a{,m}" } "At most m occurrences of a" }
+    { { $snippet "a{n,m}" } "Between n and m occurrences of a" } }
+"All of these quantifiers are " { $emphasis "greedy" } ", meaning that they take as many repetitions as possible within the larger regular expression. Reluctant and possessive quantifiers are not yet supported."
  { $heading "Lookaround" }
+"Operators are provided to look ahead and behind the current point in the regular expression. These can be used in any context, but they're the most useful at the beginning or end of a regular expression."
+{ $table
+    { { $snippet "(?=a)" } "Asserts that the current position is immediately followed by a" }
+    { { $snippet "(?!a)" } "Asserts that the current position is not immediately followed by a" }
+    { { $snippet "(?<=a)" } "Asserts that the current position is immediately preceded by a" }
+    { { $snippet "(?<!a)" } "Asserts that the current position is not immediately preceded by a" } }
+{ $heading "Quotation" }
+"To make it convenient to have a long string which uses regexp operators, a special syntax is provided. If a substring begins with " { $snippet "\\Q" } " then everything until " { $snippet "\\E" } " is quoted (escaped). For example, " { $snippet "R/ \\Qfoo\\bar|baz()\\E/" } " matches exactly the string " { $snippet "\"foo\\bar|baz()\"" } "."
  { $heading "Unsupported features" }
  "One missing feature is backreferences. This is because of a design decision to allow only regular expressions following the formal theory of regular languages. For more information, see " { $link { "regexp" "theory" } } ". You can create a new regular expression to match a particular string using " { $vocab-link "regexp.combinators" } " and group capture is available to extract parts of a regular expression match." $nl
  "Another feature is Perl's " { $snippet "\\G" } " syntax, which references the previous match, is not included. This is because that sequence is inherently stateful, and Factor regexps don't hold state." $nl
-"Additionally, none of the operations which embed code into a regexp are supported, as this would require the inclusion of the Factor parser and compiler in any application which wants to expose regexps to the user. None of the casing operations are included, for simplicity." ; ! Also describe syntax, from the beginning
+"None of the operations which embed code into a regexp are supported, as this would require the inclusion of the Factor parser and compiler in any application which wants to expose regexps to the user. None of Perl's casing operations, like \\L, are included, for simplicity." ; ! Also describe syntax, from the beginning
  
  ARTICLE: { "regexp" "options" } "Regular expression options"
  "When " { $link { "regexp" "construction" } } ", various options can be provided. Options have single-character names. A string of options has one of the following two forms:"
@@ -58,13 +109,30 @@ $nl
    { "i" { $link case-insensitive } }
    { "d" { $link unix-lines } }
    { "m" { $link multiline } }
-  { "n" { $link multiline } }
-  { "r" { $link reversed-regexp } }
    { "s" { $link dotall } }
-  { "u" { $link unicode-case } }
-  { "x" { $link comments } }
+  { "r" { $link reversed-regexp } }
  } ;
  
+HELP: case-insensitive
+{ $syntax "R/ .../i" }
+{ $description "On regexps, the " { $snippet "i" } " option makes the match case-insensitive. Currently, this is handled incorrectly with respect to Unicode, as characters like ß do not expand into SS in upper case. This should be fixed in a future version." } ;
+
+HELP: unix-lines
+{ $syntax "R/ .../d" }
+{ $description "With this mode, only newlines (" { $snippet "\\n" } ") are recognized for line breaking. This affects " { $snippet "$" } " and " { $snippet "^" } " when in multiline mode." } ;
+
+HELP: multiline
+{ $syntax "R/ .../m" }
+{ $description "This mode makes the zero-width constraints " { $snippet "$" } " and " { $snippet "^" } " match the beginning or end of a line. Otherwise, they only match the beginning or end of the input text. This can be used together with " { $link dotall } "." } ;
+
+HELP: dotall
+{ $syntax "R/ .../s" }
+{ $description "This mode, traditionally called single line mode, makes " { $snippet "." } " match everything, including line breaks. By default, it does not match line breaking characters. This can be used together with " { $link multiline } "." } ;
+
+HELP: reversed-regexp
+{ $syntax "R/ .../r" }
+{ $description "When running a regexp compiled with this mode, matches will start from the end of the input string, going towards the beginning." } ;
+
  ARTICLE: { "regexp" "theory" } "The theory of regular expressions"
  "Far from being just a practical tool invented by Unix hackers, regular expressions were studied formally before computer programs were written to process them." $nl
  "A regular language is a set of strings that is matched by a regular expression, which is defined to have characters and the empty string, along with the operations concatenation, disjunction and Kleene star. Another way to define the class of regular languages is as the class of languages which can be recognized with constant space overhead, ie with a DFA. These two definitions are provably equivalent." $nl
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor

index a449b3e2f0b0891bbaa01aecdf68cc1642d90784..0836c0988b1a434efb880f7da3061ba2d6fb42ca 100644 (file)
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -470,3 +470,13 @@ IN: regexp-tests
  [ t ] [ "abcdefg" "a(?:bcdefg)" <regexp> matches? ] unit-test
  
  [ 3 ] [ "caba" "(?<=b)a" <regexp> first-match from>> ] unit-test
+
+[ t ] [ "\ra" R/ .^a/ms matches? ] unit-test
+[ f ] [ "\ra" R/ .^a/mds matches? ] unit-test
+[ t ] [ "\na" R/ .^a/ms matches? ] unit-test
+[ t ] [ "\na" R/ .^a/mds matches? ] unit-test
+
+[ t ] [ "a\r" R/ a$./ms matches? ] unit-test
+[ f ] [ "a\r" R/ a$./mds matches? ] unit-test
+[ t ] [ "a\n" R/ a$./ms matches? ] unit-test
+[ t ] [ "a\n" R/ a$./mds matches? ] unit-test
author	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Mon, 16 Mar 2009 22:53:38 +0000 (17:53 -0500)
committer	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Mon, 16 Mar 2009 22:53:38 +0000 (17:53 -0500)
basis/regexp/ast/ast.factor		patch \| blob \| history
basis/regexp/classes/classes.factor		patch \| blob \| history
basis/regexp/compiler/compiler.factor		patch \| blob \| history
basis/regexp/nfa/nfa.factor		patch \| blob \| history
basis/regexp/parser/parser.factor		patch \| blob \| history
basis/regexp/regexp-docs.factor		patch \| blob \| history
basis/regexp/regexp-tests.factor		patch \| blob \| history