Merge branch 'master' into regexp

author Daniel Ehrenberg <littledan@Macintosh-122.local>

Tue, 3 Mar 2009 18:24:59 +0000 (12:24 -0600)

committer Daniel Ehrenberg <littledan@Macintosh-122.local>

Tue, 3 Mar 2009 18:24:59 +0000 (12:24 -0600)
author Daniel Ehrenberg <littledan@Macintosh-122.local>
Tue, 3 Mar 2009 18:24:59 +0000 (12:24 -0600)
committer Daniel Ehrenberg <littledan@Macintosh-122.local>
Tue, 3 Mar 2009 18:24:59 +0000 (12:24 -0600)
diff --git a/basis/ascii/ascii.factor b/basis/ascii/ascii.factor

index 193e847d2714ee868e2e195373a067557bcf6b89..bd1b86b2793347fcf56dfa1923b9b87a4184a508 100644 (file)
--- a/basis/ascii/ascii.factor
+++ b/basis/ascii/ascii.factor
@@ -10,7 +10,7 @@ IN: ascii
  : LETTER? ( ch -- ? ) CHAR: A CHAR: Z between? ; inline\r
  : digit? ( ch -- ? ) CHAR: 0 CHAR: 9 between? ; inline\r
  : printable? ( ch -- ? ) CHAR: \s CHAR: ~ between? ; inline\r
-: control? ( ch -- ? ) "\0\e\r\n\t\u000008\u00007f" member? ; inline\r
+: control? ( ch -- ? ) { [ 0 HEX: 1F between? ] [ HEX: 7F = ] } 1|| ; inline\r
  : quotable? ( ch -- ? ) { [ printable? ] [ "\"\\" member? not ] } 1&& ; inline\r
  : Letter? ( ch -- ? ) { [ letter? ] [ LETTER? ] } 1|| ; inline\r
  : alpha? ( ch -- ? ) { [ Letter? ] [ digit? ] } 1|| ; inline\r
@@ -20,4 +20,4 @@ IN: ascii
  : >upper ( str -- upper ) [ ch>upper ] map ;\r
  \r
  HINTS: >lower string ;\r
-HINTS: >upper string ;
-\ No newline at end of file
+HINTS: >upper string ;\r
diff --git a/basis/globs/globs-tests.factor b/basis/globs/globs-tests.factor

index 446f1ee0a9bef6a53d0648705c74e1935bbebb22..45eb27ea62e338c433fa1abf82dcfcec8e311e7c 100644 (file)
--- a/basis/globs/globs-tests.factor
+++ b/basis/globs/globs-tests.factor
@@ -14,5 +14,6 @@ USING: tools.test globs ;
  [ f ] [ "foo.java" "*.{xml,txt}" glob-matches? ] unit-test
  [ t ] [ "foo.txt" "*.{xml,txt}" glob-matches? ] unit-test
  [ t ] [ "foo.xml" "*.{xml,txt}" glob-matches? ] unit-test
-[ f ] [ "foo." "*.{,xml,txt}" glob-matches? ] unit-test
+[ f ] [ "foo." "*.{xml,txt}" glob-matches? ] unit-test
+[ t ] [ "foo." "*.{,xml,txt}" glob-matches? ] unit-test
  [ t ] [ "foo.{" "*.{" glob-matches? ] unit-test
diff --git a/basis/globs/globs.factor b/basis/globs/globs.factor

index 14ddb0ed9b7cbc7352097cabad4afcf60c8a5bc6..173187574b67f2e67195a70445d7393bd137a9ca 100644 (file)
--- a/basis/globs/globs.factor
+++ b/basis/globs/globs.factor
@@ -1,42 +1,42 @@
-! Copyright (C) 2007 Slava Pestov.
+! Copyright (C) 2007, 2009 Slava Pestov, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: parser-combinators parser-combinators.regexp lists sequences kernel
-promises strings unicode.case ;
+USING: sequences kernel regexp.combinators regexp.matchers strings unicode.case
+peg.ebnf regexp arrays ;
  IN: globs
  
-<PRIVATE
+EBNF: <glob>
  
-: 'char' ( -- parser )
-    [ ",*?" member? not ] satisfy ;
+Character = "\\" .:c => [[ c 1string <literal> ]]
+          | !(","|"}") . => [[ 1string <literal> ]]
  
-: 'string' ( -- parser )
-    'char' <+> [ >lower token ] <@ ;
+RangeCharacter = !("]") .
  
-: 'escaped-char' ( -- parser )
-    "\\" token any-char-parser &> [ 1token ] <@ ;
+Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <char-range> ]]
+      | RangeCharacter => [[ 1string <literal> ]]
  
-: 'escaped-string' ( -- parser )
-    'string' 'escaped-char' <|> ;
+StartRange = .:a "-" RangeCharacter:b => [[ a b <char-range> ]]
+           | . => [[ 1string <literal> ]]
  
-DEFER: 'term'
+Ranges = StartRange:s Range*:r => [[ r s prefix ]]
  
-: 'glob' ( -- parser )
-    'term' <*> [ <and-parser> ] <@ ;
+CharClass = "^"?:n Ranges:e => [[ e <or> n [ <not> ] when ]]
  
-: 'union' ( -- parser )
-    'glob' "," token nonempty-list-of "{" "}" surrounded-by
-    [ <or-parser> ] <@ ;
+AlternationBody = Concatenation:c "," AlternationBody:a => [[ a c prefix ]]
+                | Concatenation => [[ 1array ]]
  
-LAZY: 'term' ( -- parser )
-    'union'
-    'character-class' <|>
-    "?" token [ drop any-char-parser ] <@ <|>
-    "*" token [ drop any-char-parser <*> ] <@ <|>
-    'escaped-string' <|> ;
+Element = "*" => [[ R/ .*/ ]]
+        | "?" => [[ R/ ./ ]]
+        | "[" CharClass:c "]" => [[ c ]]
+        | "{" AlternationBody:b "}" => [[ b <or> ]]
+        | Character
  
-PRIVATE>
+Concatenation = Element* => [[ <sequence> ]]
  
-: <glob> ( string -- glob ) 'glob' just parse-1 just ;
+End = !(.)
+
+Main = Concatenation End
+
+;EBNF
  
  : glob-matches? ( input glob -- ? )
-    [ >lower ] [ <glob> ] bi* parse nil? not ;
+    [ >case-fold ] bi@ <glob> matches? ;
diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor

new file mode 100644 (file)

index 0000000..b804eac
--- /dev/null
+++ b/basis/regexp/ast/ast.factor
@@ -0,0 +1,72 @@
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel arrays accessors fry sequences regexp.classes ;
+FROM: math.ranges => [a,b] ;
+IN: regexp.ast
+
+TUPLE: negation term ;
+C: <negation> negation
+
+TUPLE: from-to n m ;
+C: <from-to> from-to
+
+TUPLE: at-least n ;
+C: <at-least> at-least
+
+TUPLE: tagged-epsilon tag ;
+C: <tagged-epsilon> tagged-epsilon
+
+CONSTANT: epsilon T{ tagged-epsilon }
+
+TUPLE: concatenation first second ;
+
+: <concatenation> ( seq -- concatenation )
+    [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
+
+TUPLE: alternation first second ;
+
+: <alternation> ( seq -- alternation )
+    unclip [ alternation boa ] reduce ;
+
+TUPLE: star term ;
+C: <star> star
+
+TUPLE: with-options tree options ;
+C: <with-options> with-options
+
+TUPLE: options on off ;
+C: <options> options
+
+SINGLETONS: unix-lines dotall multiline comments case-insensitive
+unicode-case reversed-regexp ;
+
+: <maybe> ( term -- term' )
+    f <concatenation> 2array <alternation> ;
+
+: <plus> ( term -- term' )
+    dup <star> 2array <concatenation> ;
+
+: repetition ( n term -- term' )
+    <array> <concatenation> ;
+
+GENERIC: <times> ( term times -- term' )
+M: at-least <times>
+    n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
+M: from-to <times>
+    [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
+
+: char-class ( ranges ? -- term )
+    [ <or-class> ] dip [ <not-class> ] when ;
+
+TUPLE: lookahead term ;
+C: <lookahead> lookahead
+
+TUPLE: lookbehind term ;
+C: <lookbehind> lookbehind
+
+TUPLE: possessive-star term ;
+C: <possessive-star> possessive-star
+
+: <possessive-plus> ( term -- term' )
+    dup <possessive-star> 2array <concatenation> ;
+
diff --git a/basis/regexp/backend/backend.factor b/basis/regexp/backend/backend.factor

deleted file mode 100644 (file)

index 5eff057..0000000
--- a/basis/regexp/backend/backend.factor
+++ /dev/null
@@ -1,27 +0,0 @@
-! Copyright (C) 2008 Doug Coleman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors hashtables kernel math vectors ;
-IN: regexp.backend
-
-TUPLE: regexp
-    raw
-    { options hashtable }
-    stack
-    parse-tree
-    nfa-table
-    dfa-table
-    minimized-table
-    matchers
-    { nfa-traversal-flags hashtable }
-    { dfa-traversal-flags hashtable }
-    { state integer }
-    { new-states vector }
-    { visited-states hashtable } ;
-
-: reset-regexp ( regexp -- regexp )
-    0 >>state
-    V{ } clone >>stack
-    V{ } clone >>new-states
-    H{ } clone >>visited-states ;
-
-SYMBOL: current-regexp
diff --git a/basis/regexp/classes/classes-tests.factor b/basis/regexp/classes/classes-tests.factor

new file mode 100644 (file)

index 0000000..4cbb2e7
--- /dev/null
+++ b/basis/regexp/classes/classes-tests.factor
@@ -0,0 +1,25 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.classes tools.test arrays kernel ;
+IN: regexp.classes.tests
+
+[ f ] [ { 1 2 } <and-class> ] unit-test
+[ T{ or-class f { 2 1 } } ] [ { 1 2 } <or-class> ] unit-test
+[ 3 ] [ { 1 2 } <and-class> 3 2array <or-class> ] unit-test
+[ CHAR: A ] [ CHAR: A LETTER-class <primitive-class> 2array <and-class> ] unit-test
+[ CHAR: A ] [ LETTER-class <primitive-class> CHAR: A 2array <and-class> ] unit-test
+[ T{ primitive-class { class LETTER-class } } ] [ CHAR: A LETTER-class <primitive-class> 2array <or-class> ] unit-test
+[ T{ primitive-class { class LETTER-class } } ] [ LETTER-class <primitive-class> CHAR: A 2array <or-class> ] unit-test
+[ t ] [ { t 1 } <or-class> ] unit-test
+[ t ] [ { 1 t } <or-class> ] unit-test
+[ f ] [ { f 1 } <and-class> ] unit-test
+[ f ] [ { 1 f } <and-class> ] unit-test
+[ 1 ] [ { f 1 } <or-class> ] unit-test
+[ 1 ] [ { 1 f } <or-class> ] unit-test
+[ 1 ] [ { t 1 } <and-class> ] unit-test
+[ 1 ] [ { 1 t } <and-class> ] unit-test
+[ 1 ] [ 1 <not-class> <not-class> ] unit-test
+[ 1 ] [ { 1 1 } <and-class> ] unit-test
+[ 1 ] [ { 1 1 } <or-class> ] unit-test
+[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <and-class> ] unit-test
+[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <or-class> ] unit-test
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor

index 4a807fa51bbc0f815282c086e77d136517707b69..978be2c369bbab199e2c008fdb5de3de250da666 100644 (file)
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -1,7 +1,7 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors kernel math math.order words regexp.utils
-unicode.categories combinators.short-circuit ;
+USING: accessors kernel math math.order words combinators locals
+ascii unicode.categories combinators.short-circuit sequences ;
  IN: regexp.classes
  
  SINGLETONS: any-char any-char-no-nl
@@ -11,19 +11,18 @@ ascii-class punctuation-class java-printable-class blank-class
  control-character-class hex-digit-class java-blank-class c-identifier-class
  unmatchable-class terminator-class word-boundary-class ;
  
-SINGLETONS: beginning-of-input beginning-of-line
-end-of-input end-of-line ;
+SINGLETONS: beginning-of-input ^ end-of-input $ ;
  
-MIXIN: node
-TUPLE: character-class-range from to ; INSTANCE: character-class-range node
+TUPLE: range from to ;
+C: <range> range
  
  GENERIC: class-member? ( obj class -- ? )
  
-M: t class-member? ( obj class -- ? ) 2drop f ;
+M: t class-member? ( obj class -- ? ) 2drop t ;
  
-M: integer class-member? ( obj class -- ? ) 2drop f ;
+M: integer class-member? ( obj class -- ? ) = ;
  
-M: character-class-range class-member? ( obj class -- ? )
+M: range class-member? ( obj class -- ? )
      [ from>> ] [ to>> ] bi between? ;
  
  M: any-char class-member? ( obj class -- ? )
@@ -47,16 +46,24 @@ M: ascii-class class-member? ( obj class -- ? )
  M: digit-class class-member? ( obj class -- ? )
      drop digit? ;
  
+: c-identifier-char? ( ch -- ? )
+    { [ alpha? ] [ CHAR: _ = ] } 1|| ;
+
  M: c-identifier-class class-member? ( obj class -- ? )
-    drop
-    { [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
+    drop c-identifier-char? ;
  
  M: alpha-class class-member? ( obj class -- ? )
      drop alpha? ;
  
+: punct? ( ch -- ? )
+    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
+
  M: punctuation-class class-member? ( obj class -- ? )
      drop punct? ;
  
+: java-printable? ( ch -- ? )
+    { [ alpha? ] [ punct? ] } 1|| ;
+
  M: java-printable-class class-member? ( obj class -- ? )
      drop java-printable? ;
  
@@ -64,11 +71,24 @@ M: non-newline-blank-class class-member? ( obj class -- ? )
      drop { [ blank? ] [ CHAR: \n = not ] } 1&& ;
  
  M: control-character-class class-member? ( obj class -- ? )
-    drop control-char? ;
+    drop control? ;
+
+: hex-digit? ( ch -- ? )
+    {
+        [ CHAR: A CHAR: F between? ]
+        [ CHAR: a CHAR: f between? ]
+        [ CHAR: 0 CHAR: 9 between? ]
+    } 1|| ;
  
  M: hex-digit-class class-member? ( obj class -- ? )
      drop hex-digit? ;
  
+: java-blank? ( ch -- ? )
+    {
+        CHAR: \s CHAR: \t CHAR: \n
+        HEX: b HEX: 7 CHAR: \r
+    } member? ;
+
  M: java-blank-class class-member? ( obj class -- ? )
      drop java-blank? ;
  
@@ -76,16 +96,100 @@ M: unmatchable-class class-member? ( obj class -- ? )
      2drop f ;
  
  M: terminator-class class-member? ( obj class -- ? )
-    drop {
-        [ CHAR: \r = ]
-        [ CHAR: \n = ]
-        [ CHAR: \u000085 = ]
-        [ CHAR: \u002028 = ]
-        [ CHAR: \u002029 = ]
-    } 1|| ;
+    drop "\r\n\u000085\u002029\u002028" member? ;
  
-M: beginning-of-line class-member? ( obj class -- ? )
+M: ^ class-member? ( obj class -- ? )
      2drop f ;
  
-M: end-of-line class-member? ( obj class -- ? )
+M: $ class-member? ( obj class -- ? )
      2drop f ;
+
+M: f class-member? 2drop f ;
+
+TUPLE: primitive-class class ;
+C: <primitive-class> primitive-class
+
+TUPLE: or-class seq ;
+
+TUPLE: not-class class ;
+
+TUPLE: and-class seq ;
+
+GENERIC: combine-and ( class1 class2 -- combined ? )
+
+: replace-if-= ( object object -- object ? )
+    over = ;
+
+M: object combine-and replace-if-= ;
+
+M: t combine-and
+    drop t ;
+
+M: f combine-and
+    nip t ;
+
+M: not-class combine-and
+    class>> = [ f t ] [ f f ] if ;
+
+M: integer combine-and
+    swap 2dup class-member? [ drop t ] [ 2drop f t ] if ;
+
+GENERIC: combine-or ( class1 class2 -- combined ? )
+
+M: object combine-or replace-if-= ;
+
+M: t combine-or
+    drop f ;
+
+M: f combine-or
+    drop t ;
+
+M: not-class combine-or
+    class>> = [ t t ] [ f f ] if ;
+
+M: integer combine-or
+    2dup swap class-member? [ drop t ] [ 2drop f f ] if ;
+
+: try-combine ( elt1 elt2 quot -- combined/f ? )
+    3dup call [ [ 3drop ] dip t ] [ drop swapd call ] if ; inline
+
+:: prefix-combining ( seq elt quot: ( elt1 elt2 -- combined/f ? ) -- newseq )
+    f :> combined!
+    seq [ elt quot try-combine swap combined! ] find drop
+    [ seq remove-nth combined prefix ]
+    [ seq elt prefix ] if* ; inline
+
+:: combine ( seq quot: ( elt1 elt2 -- combined/f ? ) empty class -- newseq )
+    seq { } [ quot prefix-combining ] reduce
+    dup length {
+        { 0 [ drop empty ] }
+        { 1 [ first ] }
+        [ drop class new swap >>seq ]
+    } case ; inline
+
+: <and-class> ( seq -- class )
+    [ combine-and ] t and-class combine ;
+
+M: and-class class-member?
+    seq>> [ class-member? ] with all? ;
+
+: <or-class> ( seq -- class )
+    [ combine-or ] f or-class combine ;
+
+M: or-class class-member?
+    seq>> [ class-member? ] with any? ;
+
+: <not-class> ( class -- inverse )
+    {
+        { t [ f ] }
+        { f [ t ] }
+        [ dup not-class? [ class>> ] [ not-class boa ] if ]
+    } case ;
+
+M: not-class class-member?
+    class>> class-member? not ;
+
+M: primitive-class class-member?
+    class>> class-member? ;
+
+UNION: class primitive-class not-class or-class range ;
diff --git a/basis/regexp/combinators/authors.txt b/basis/regexp/combinators/authors.txt

new file mode 100644 (file)

index 0000000..f990dd0
--- /dev/null
+++ b/basis/regexp/combinators/authors.txt
@@ -0,0 +1 @@
+Daniel Ehrenberg
diff --git a/basis/regexp/combinators/combinators-docs.factor b/basis/regexp/combinators/combinators-docs.factor

new file mode 100644 (file)

index 0000000..7cb214f
--- /dev/null
+++ b/basis/regexp/combinators/combinators-docs.factor
@@ -0,0 +1,54 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.syntax help.markup regexp strings ;
+IN: regexp.combinators
+
+ABOUT: "regexp.combinators"
+
+ARTICLE: "regexp.combinators" "Regular expression combinators"
+"The " { $vocab-link "regexp.combinators" } " vocabulary defines combinators which can be used to build up regular expressions to match strings. This is in addition to the traditional syntax defined in the " { $vocab-link "regexp" } " vocabulary."
+{ $subsection <literal> }
+{ $subsection <nothing> }
+{ $subsection <or> }
+{ $subsection <and> }
+{ $subsection <not> }
+{ $subsection <sequence> }
+{ $subsection <zero-or-more> }
+{ $subsection <one-or-more> }
+{ $subsection <option> } ;
+
+HELP: <literal>
+{ $values { "string" string } { "regexp" regexp } }
+{ $description "Creates a regular expression which matches the given literal string." } ;
+
+HELP: <nothing>
+{ $values { "value" regexp } }
+{ $description "The empty regular language." } ;
+
+HELP: <or>
+{ $values { "regexps" "a sequence of regular expressions" } { "disjunction" regexp } }
+{ $description "Creates a new regular expression which matches the union of what elements of the sequence match." } ;
+
+HELP: <and>
+{ $values { "regexps" "a sequence of regular expressions" } { "conjunction" regexp } }
+{ $description "Creates a new regular expression which matches the intersection of what elements of the sequence match." } ;
+
+HELP: <sequence>
+{ $values { "regexps" "a sequence of regular expressions" } { "regexp" regexp } }
+{ $description "Creates a new regular expression which matches strings that match each element of the sequence in order." } ;
+
+HELP: <not>
+{ $values { "regexp" regexp } { "not-regexp" regexp } }
+{ $description "Creates a new regular expression which matches everything that the given regexp does not match." } ;
+
+HELP: <one-or-more>
+{ $values { "regexp" regexp } { "regexp+" regexp } }
+{ $description "Creates a new regular expression which matches one or more copies of the given regexp." } ;
+
+HELP: <option>
+{ $values { "regexp" regexp } { "regexp?" regexp } }
+{ $description "Creates a new regular expression which matches zero or one copies of the given regexp." } ;
+
+HELP: <zero-or-more>
+{ $values { "regexp" regexp } { "regexp*" regexp } }
+{ $description "Creates a new regular expression which matches zero or more copies of the given regexp." } ;
diff --git a/basis/regexp/combinators/combinators-tests.factor b/basis/regexp/combinators/combinators-tests.factor

new file mode 100644 (file)

index 0000000..70cc020
--- /dev/null
+++ b/basis/regexp/combinators/combinators-tests.factor
@@ -0,0 +1,29 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.combinators tools.test regexp kernel sequences regexp.matchers ;
+IN: regexp.combinators.tests
+
+: strings ( -- regexp )
+    { "foo" "bar" "baz" } <any-of> ;
+
+[ t t t ] [ "foo" "bar" "baz" [ strings matches? ] tri@ ] unit-test
+[ f f f ] [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test
+
+: conj ( -- regexp )
+    { R/ .*a/ R/ b.*/ } <and> ;
+
+[ t ] [ "bljhasflsda" conj matches? ] unit-test
+[ f ] [ "bsdfdfs" conj matches? ] unit-test ! why does this fail?
+[ f ] [ "fsfa" conj matches? ] unit-test
+
+! For some reason, creating this DFA doesn't work
+! [ f ] [ "bljhasflsda" conj <not> matches? ] unit-test
+! [ t ] [ "bsdfdfs" conj <not> matches? ] unit-test
+! [ t ] [ "fsfa" conj <not> matches? ] unit-test
+
+[ f f ] [ "" "hi" [ <nothing> matches? ] bi@ ] unit-test
+[ t t ] [ "" "hi" [ <nothing> <not> matches? ] bi@ ] unit-test
+
+[ { t t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <zero-or-more> matches? ] map ] unit-test
+[ { f t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <one-or-more> matches? ] map ] unit-test
+[ { t t f f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <option> matches? ] map ] unit-test
diff --git a/basis/regexp/combinators/combinators.factor b/basis/regexp/combinators/combinators.factor

new file mode 100644 (file)

index 0000000..2941afd
--- /dev/null
+++ b/basis/regexp/combinators/combinators.factor
@@ -0,0 +1,56 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp sequences kernel regexp.negation regexp.ast
+accessors fry regexp.classes ;
+IN: regexp.combinators
+
+<PRIVATE
+
+: modify-regexp ( regexp raw-quot tree-quot -- new-regexp )
+    [ '[ raw>> @ ] ]
+    [ '[ parse-tree>> @ ] ] bi* bi
+    make-regexp ; inline
+
+PRIVATE>
+
+CONSTANT: <nothing> R/ (?~.*)/
+
+: <literal> ( string -- regexp )
+    [ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable
+
+: <char-range> ( char1 char2 -- regexp )
+    [ [ "[" "-" surround ] [ "]" append ] bi* append ]
+    [ <range> ]
+    2bi make-regexp ;
+
+: <or> ( regexps -- disjunction )
+    [ [ raw>> "(" ")" surround ] map "|" join ]
+    [ [ parse-tree>> ] map <alternation> ] bi
+    make-regexp ; foldable
+
+: <any-of> ( strings -- regexp )
+    [ <literal> ] map <or> ; foldable
+
+: <sequence> ( regexps -- regexp )
+    [ [ raw>> ] map concat ]
+    [ [ parse-tree>> ] map <concatenation> ] bi
+    make-regexp ; foldable
+
+: <not> ( regexp -- not-regexp )
+    [ "(?~" ")" surround ]
+    [ <negation> ] modify-regexp ; foldable
+
+: <and> ( regexps -- conjunction )
+    [ <not> ] map <or> <not> ; foldable
+
+: <zero-or-more> ( regexp -- regexp* )
+    [ "(" ")*" surround ]
+    [ <star> ] modify-regexp ; foldable
+
+: <one-or-more> ( regexp -- regexp+ )
+    [ "(" ")+" surround ]
+    [ <plus> ] modify-regexp ; foldable
+
+: <option> ( regexp -- regexp? )
+    [ "(" ")?" surround ]
+    [ <maybe> ] modify-regexp ; foldable
diff --git a/basis/regexp/combinators/summary.txt b/basis/regexp/combinators/summary.txt

new file mode 100644 (file)

index 0000000..1b3fb6c
--- /dev/null
+++ b/basis/regexp/combinators/summary.txt
@@ -0,0 +1 @@
+Combinators for creating regular expressions
diff --git a/basis/regexp/combinators/tags.txt b/basis/regexp/combinators/tags.txt

new file mode 100644 (file)

index 0000000..9da5688
--- /dev/null
+++ b/basis/regexp/combinators/tags.txt
@@ -0,0 +1 @@
+parsing
diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor

new file mode 100644 (file)

index 0000000..7fda010
--- /dev/null
+++ b/basis/regexp/compiler/compiler.factor
@@ -0,0 +1,83 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.classes kernel sequences regexp.negation
+quotations regexp.minimize assocs fry math locals combinators
+accessors words compiler.units kernel.private strings
+sequences.private arrays regexp.matchers call ;
+IN: regexp.compiler
+
+: literals>cases ( literal-transitions -- case-body )
+    [ 1quotation ] assoc-map ;
+
+: non-literals>dispatch ( non-literal-transitions -- quot )
+    [ [ '[ dup _ class-member? ] ] [ '[ drop _ execute ] ] bi* ] assoc-map
+    [ 3drop ] suffix '[ _ cond ] ;
+
+: expand-one-or ( or-class transition -- alist )
+    [ seq>> ] dip '[ _ 2array ] map ;
+
+: expand-or ( alist -- new-alist )
+    [
+        first2 over or-class?
+        [ expand-one-or ] [ 2array 1array ] if
+    ] map concat ;
+
+: split-literals ( transitions -- case default )
+    >alist expand-or [ first integer? ] partition
+    [ literals>cases ] [ non-literals>dispatch ] bi* ;
+
+:: step ( last-match index str case-body final? -- last-index/f )
+    final? index last-match ?
+    index str bounds-check? [
+        index 1+ str
+        index str nth-unsafe
+        case-body case
+    ] when ; inline
+
+: transitions>quot ( transitions final-state? -- quot )
+    [ split-literals suffix ] dip
+    '[ { array-capacity string } declare _ _ step ] ;
+
+: word>quot ( word dfa -- quot )
+    [ transitions>> at ]
+    [ final-states>> key? ] 2bi
+    transitions>quot ;
+
+: states>code ( words dfa -- )
+    '[
+        [
+            dup _ word>quot
+            (( last-match index string -- ? ))
+            define-declared
+        ] each
+    ] with-compilation-unit ;
+
+: transitions-at ( transitions assoc -- new-transitions )
+    dup '[
+        [ _ at ]
+        [ [ _ at ] assoc-map ] bi*
+    ] assoc-map ;
+
+: states>words ( dfa -- words dfa )
+    dup transitions>> keys [ gensym ] H{ } map>assoc
+    [ [ transitions-at ] rewrite-transitions ]
+    [ values ]
+    bi swap ; 
+
+: dfa>word ( dfa -- word )
+    states>words [ states>code ] keep start-state>> ;
+
+: check-string ( string -- string )
+    dup string? [ "String required" throw ] unless ;
+
+: run-regexp ( start-index string word -- ? )
+    { [ f ] [ >fixnum ] [ check-string ] [ execute ] } spread ; inline
+
+: dfa>quotation ( dfa -- quot )
+    dfa>word '[ _ run-regexp ] ;
+
+TUPLE: quot-matcher quot ;
+C: <quot-matcher> quot-matcher
+
+M: quot-matcher match-index-from
+    quot>> call( index string -- i/f ) ;
diff --git a/basis/regexp/dfa/dfa-tests.factor b/basis/regexp/dfa/dfa-tests.factor

new file mode 100644 (file)

index 0000000..b6ce13c
--- /dev/null
+++ b/basis/regexp/dfa/dfa-tests.factor
@@ -0,0 +1,5 @@
+USING: regexp.dfa tools.test ;
+IN: regexp.dfa.tests
+
+[ [ ] [ ] while-changes ] must-infer
+
diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor

index 549669cab727328eabd5fd6244d247fb52495160..01e3e01119ddb377d23eb9f299ea68cf27a85ea8 100644 (file)
--- a/basis/regexp/dfa/dfa.factor
+++ b/basis/regexp/dfa/dfa.factor
@@ -1,84 +1,79 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors arrays assocs combinators fry kernel locals
  math math.order regexp.nfa regexp.transition-tables sequences
-sets sorting vectors regexp.utils sequences.deep ;
-USING: io prettyprint threads ;
+sets sorting vectors regexp.ast ;
  IN: regexp.dfa
  
-: find-delta ( states transition regexp -- new-states )
-    nfa-table>> transitions>>
-    rot [ swap at at ] with with gather sift ;
+:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj )
+    obj quot call :> new-obj
+    new-obj comp call :> new-key
+    new-key old-key =
+    [ new-obj ]
+    [ new-obj quot comp new-key (while-changes) ]
+    if ; inline recursive
  
-: (find-epsilon-closure) ( states regexp -- new-states )
-    eps swap find-delta ;
+: while-changes ( obj quot pred -- obj' )
+    3dup nip call (while-changes) ; inline
  
-: find-epsilon-closure ( states regexp -- new-states )
+: find-delta ( states transition nfa -- new-states )
+    transitions>> '[ _ swap _ at at ] gather sift ;
+
+: (find-epsilon-closure) ( states nfa -- new-states )
+    epsilon swap find-delta ;
+
+: find-epsilon-closure ( states nfa -- new-states )
      '[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
      natural-sort ;
  
-: find-closure ( states transition regexp -- new-states )
-    [ find-delta ] 2keep nip find-epsilon-closure ;
+: find-closure ( states transition nfa -- new-states )
+    [ find-delta ] keep find-epsilon-closure ;
  
-: find-start-state ( regexp -- state )
-    [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ;
+: find-start-state ( nfa -- state )
+    [ start-state>> 1vector ] keep find-epsilon-closure ;
  
-: find-transitions ( seq1 regexp -- seq2 )
-    nfa-table>> transitions>>
-    [ at keys ] curry gather
-    eps swap remove ;
+: find-transitions ( dfa-state nfa -- next-dfa-state )
+    transitions>>
+    '[ _ at keys ] gather
+    epsilon swap remove ;
  
-: add-todo-state ( state regexp -- )
-    2dup visited-states>> key? [
-        2drop
-    ] [
-        [ visited-states>> conjoin ]
-        [ new-states>> push ] 2bi
+: add-todo-state ( state visited-states new-states -- )
+    3dup drop key? [ 3drop ] [
+        [ conjoin ] [ push ] bi-curry* bi
      ] if ;
  
-: new-transitions ( regexp -- )
-    dup new-states>> [
-        drop
-    ] [
-        dupd pop dup pick find-transitions rot
-        [
-            [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
-            [ swapd transition make-transition ] dip
-            dfa-table>> add-transition 
-        ] curry with each
-        new-transitions
+:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
+    new-states [ nfa dfa ] [
+        pop :> state
+        state nfa find-transitions
+        [| trans |
+            state trans nfa find-closure :> new-state
+            new-state visited-states new-states add-todo-state
+            state new-state trans dfa add-transition
+        ] each
+        nfa dfa new-states visited-states new-transitions
      ] if-empty ;
  
  : states ( hashtable -- array )
      [ keys ]
-    [ values [ values concat ] map concat append ] bi ;
+    [ values [ values concat ] map concat ] bi
+    append ;
  
-: set-final-states ( regexp -- )
-    dup
-    [ nfa-table>> final-states>> keys ]
-    [ dfa-table>> transitions>> states ] bi
-    [ intersects? ] with filter
-
-    swap dfa-table>> final-states>>
+: set-final-states ( nfa dfa -- )
+    [
+        [ final-states>> keys ]
+        [ transitions>> states ] bi*
+        [ intersects? ] with filter
+    ] [ final-states>> ] bi
      [ conjoin ] curry each ;
  
-: set-initial-state ( regexp -- )
-    dup
-    [ dfa-table>> ] [ find-start-state ] bi
-    [ >>start-state drop ] keep
-    1vector >>new-states drop ;
-
-: set-traversal-flags ( regexp -- )
-    dup
-    [ nfa-traversal-flags>> ]
-    [ dfa-table>> transitions>> keys ] bi
-    [ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc
-    >>dfa-traversal-flags drop ;
+: initialize-dfa ( nfa -- dfa )
+    <transition-table>
+        swap find-start-state >>start-state ;
  
-: construct-dfa ( regexp -- )
-    {
-        [ set-initial-state ]
-        [ new-transitions ]
-        [ set-final-states ]
-        [ set-traversal-flags ]
-    } cleave ;
+: construct-dfa ( nfa -- dfa )
+    dup initialize-dfa
+    dup start-state>> 1vector
+    H{ } clone
+    new-transitions
+    [ set-final-states ] keep ;
diff --git a/basis/regexp/disambiguate/disambiguate.factor b/basis/regexp/disambiguate/disambiguate.factor

new file mode 100644 (file)

index 0000000..b8c03d7
--- /dev/null
+++ b/basis/regexp/disambiguate/disambiguate.factor
@@ -0,0 +1,41 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel accessors regexp.classes math.bits assocs sequences
+arrays sets regexp.dfa math fry regexp.minimize ;
+IN: regexp.disambiguate
+
+TUPLE: parts in out ;
+
+: make-partition ( choices classes -- partition )
+    zip [ first ] partition [ values ] bi@ parts boa ;
+
+: powerset-partition ( classes -- partitions )
+    [ length [ 2^ ] keep ] keep '[
+        _ <bits> _ make-partition
+    ] map rest ;
+
+: partition>class ( parts -- class )
+    [ out>> [ <not-class> ] map ]
+    [ in>> <and-class> ] bi
+    prefix <and-class> ;
+
+: get-transitions ( partition state-transitions -- next-states )
+    [ in>> ] dip '[ _ at ] map prune ;
+
+: disambiguate ( dfa -- nfa )  
+    [
+        [
+            [ keys powerset-partition ] keep '[
+                [ partition>class ]
+                [ _ get-transitions ] bi
+            ] H{ } map>assoc
+            [ drop ] assoc-filter 
+        ] assoc-map
+    ] change-transitions ;
+
+USE: sorting
+
+: nfa>dfa ( nfa -- dfa )
+    construct-dfa minimize
+    disambiguate
+    construct-dfa minimize ;
diff --git a/basis/regexp/matchers/matchers.factor b/basis/regexp/matchers/matchers.factor

new file mode 100644 (file)

index 0000000..4b5f291
--- /dev/null
+++ b/basis/regexp/matchers/matchers.factor
@@ -0,0 +1,64 @@
+! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel sequences math splitting make fry locals math.ranges
+accessors arrays ;
+IN: regexp.matchers
+
+! For now, a matcher is just something with a method to do the
+! equivalent of match.
+
+GENERIC: match-index-from ( i string matcher -- index/f )
+
+: match-index-head ( string matcher -- index/f )
+    [ 0 ] 2dip match-index-from ;
+
+: match-slice ( i string matcher -- slice/f )
+    [ 2dup ] dip match-index-from
+    [ swap <slice> ] [ 2drop f ] if* ;
+
+: matches? ( string matcher -- ? )
+    dupd match-index-head
+    [ swap length = ] [ drop f ] if* ;
+
+: map-find ( seq quot -- result elt )
+    [ f ] 2dip
+    '[ nip @ dup ] find
+    [ [ drop f ] unless ] dip ; inline
+
+:: match-from ( i string matcher -- slice/f )
+    i string length [a,b)
+    [ string matcher match-slice ] map-find drop ;
+
+: match-head ( str matcher -- slice/f )
+    [ 0 ] 2dip match-from ;
+
+<PRIVATE
+
+: next-match ( i string matcher -- i match/f )
+    match-from [ dup [ to>> ] when ] keep ;
+
+PRIVATE>
+
+:: all-matches ( string matcher -- seq )
+    0 [ dup ] [ string matcher next-match ] [ ] produce nip but-last ;
+
+: count-matches ( string matcher -- n )
+    all-matches length ;
+
+<PRIVATE
+
+:: split-slices ( string slices -- new-slices )
+    slices [ to>> ] map 0 prefix
+    slices [ from>> ] map string length suffix
+    [ string <slice> ] 2map ;
+
+PRIVATE>
+
+: re-split1 ( string matcher -- before after/f )
+    dupd match-head [ 1array split-slices first2 ] [ f ] if* ;
+
+: re-split ( string matcher -- seq )
+    dupd all-matches split-slices ;
+
+: re-replace ( string matcher replacement -- result )
+    [ re-split ] dip join ;
diff --git a/basis/regexp/minimize/minimize-tests.factor b/basis/regexp/minimize/minimize-tests.factor

new file mode 100644 (file)

index 0000000..ece7c8f
--- /dev/null
+++ b/basis/regexp/minimize/minimize-tests.factor
@@ -0,0 +1,49 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test regexp.minimize assocs regexp
+accessors regexp.transition-tables ;
+IN: regexp.minimize.tests
+
+[ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test
+[ t ] [ 2 1 H{ { { 1 2 } t } } same-partition? ] unit-test
+[ f ] [ 2 3 H{ { { 1 2 } t } } same-partition? ] unit-test
+
+[ H{ { 1 1 } { 2 1 } { 3 3 } { 4 3 } } ]
+[ { { 1 1 } { 1 2 } { 2 2 } { 3 3 } { 3 4 } { 4 4 } } [ t ] H{ } map>assoc partition>classes ] unit-test
+
+[ { { 1 2 } { 3 4 } } ] [ H{ { "elephant" 1 } { "tiger" 3 } } H{ { "elephant" 2 } { "tiger" 4 } } assemble-values ] unit-test
+
+[ 3 ] [ R/ ab|ac/ dfa>> transitions>> assoc-size ] unit-test
+[ 3 ] [ R/ a(b|c)/ dfa>> transitions>> assoc-size ] unit-test
+[ 1 ] [ R/ ((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test
+[ 1 ] [ R/ a|((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test
+[ 2 ] [ R/ ab|((aa*)*)*b/ dfa>> transitions>> assoc-size ] unit-test
+[ 4 ] [ R/ ab|cd/ dfa>> transitions>> assoc-size ] unit-test
+[ 1 ] [ R/ [a-z]*|[A-Z]*/i dfa>> transitions>> assoc-size ] unit-test
+
+[
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } { CHAR: b 1 } } }
+            { 1 H{ { CHAR: a 2 } { CHAR: b 2 } } }
+            { 2 H{ { CHAR: c 3 } } }
+            { 3 H{ } }
+        } }
+        { start-state 0 }
+        { final-states H{ { 3 3 } } }
+    }
+] [ 
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } { CHAR: b 4 } } }
+            { 1 H{ { CHAR: a 2 } { CHAR: b 5 } } }
+            { 2 H{ { CHAR: c 3 } } }
+            { 3 H{ } }
+            { 4 H{ { CHAR: a 2 } { CHAR: b 5 } } }
+            { 5 H{ { CHAR: c 6 } } }
+            { 6 H{ } }
+        } }
+        { start-state 0 }
+        { final-states H{ { 3 3 } { 6 6 } } }
+    } combine-states
+] unit-test
diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor

new file mode 100644 (file)

index 0000000..c88c2a8
--- /dev/null
+++ b/basis/regexp/minimize/minimize.factor
@@ -0,0 +1,91 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel sequences regexp.transition-tables fry assocs
+accessors locals math sorting arrays sets hashtables regexp.dfa
+combinators.short-circuit ;
+IN: regexp.minimize
+
+: number-transitions ( transitions numbering -- new-transitions )
+    dup '[
+        [ _ at ]
+        [ [ first _ at ] assoc-map ] bi*
+    ] assoc-map ;
+
+: table>state-numbers ( table -- assoc )
+    transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
+
+: map-set ( assoc quot -- new-assoc )
+    '[ drop @ dup ] assoc-map ; inline
+
+: rewrite-transitions ( transition-table assoc quot -- transition-table )
+    [
+        [ clone ] dip
+        [ '[ _ at ] change-start-state ]
+        [ '[ [ _ at ] map-set ] change-final-states ]
+        [ ] tri
+    ] dip '[ _ @ ] change-transitions ; inline
+
+: number-states ( table -- newtable )
+    dup table>state-numbers
+    [ number-transitions ] rewrite-transitions ;
+
+: initially-same? ( s1 s2 transition-table -- ? )
+    {
+        [ drop <= ]
+        [ transitions>> '[ _ at keys ] bi@ set= ]
+        [ final-states>> '[ _ key? ] bi@ = ]
+    } 3&& ;
+
+:: initialize-partitions ( transition-table -- partitions )
+    ! Partition table is sorted-array => ?
+    H{ } clone :> out
+    transition-table transitions>> keys :> states
+    states [| s1 |
+        states [| s2 |
+            s1 s2 transition-table initially-same?
+            [ s1 s2 2array out conjoin ] when
+        ] each
+    ] each out ;
+
+: same-partition? ( s1 s2 partitions -- ? )
+    [ 2array natural-sort ] dip key? ;
+
+: assemble-values ( assoc1 assoc2 -- values )
+    dup keys '[ _ swap [ at ] curry map ] bi@ zip ;
+
+: stay-same? ( s1 s2 transition partitions -- ? )
+    [ '[ _ transitions>> at ] bi@ assemble-values ] dip
+    '[ _ same-partition? ] assoc-all? ;
+
+: partition-more ( partitions transition-table -- partitions )
+    over '[ drop first2 _ _ stay-same? ] assoc-filter ;
+
+: partition>classes ( partitions -- synonyms ) ! old-state => new-state
+    >alist sort-keys
+    [ drop first2 swap ] assoc-map
+    <reversed>
+    >hashtable ;
+
+: state-classes ( transition-table -- synonyms )
+    [ initialize-partitions ] keep
+    '[ _ partition-more ] [ assoc-size ] while-changes
+    partition>classes ;
+
+: canonical-state? ( state state-classes -- ? )
+    dupd at = ;
+
+: delete-duplicates ( transitions state-classes -- new-transitions )
+    '[ drop _ canonical-state? ] assoc-filter ;
+
+: rewrite-duplicates ( new-transitions state-classes -- new-transitions )
+    '[ [ _ at ] assoc-map ] assoc-map ;
+
+: combine-transitions ( transitions state-classes -- new-transitions )
+    [ delete-duplicates ] [ rewrite-duplicates ] bi ;
+
+: combine-states ( table -- smaller-table )
+    dup state-classes
+    [ combine-transitions ] rewrite-transitions ;
+
+: minimize ( table -- minimal-table )
+    clone number-states combine-states ;
diff --git a/basis/regexp/negation/negation-tests.factor b/basis/regexp/negation/negation-tests.factor

new file mode 100644 (file)

index 0000000..41dfe7f
--- /dev/null
+++ b/basis/regexp/negation/negation-tests.factor
@@ -0,0 +1,27 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
+IN: regexp.negation.tests
+
+[
+    ! R/ |[^a]|.+/
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } { T{ not-class f CHAR: a } -1 } } }
+            { 1 H{ { t -1 } } }
+            { -1 H{ { t -1 } } }
+        } } 
+        { start-state 0 }
+        { final-states H{ { 0 0 } { -1 -1 } } }
+    }
+] [
+    ! R/ a/
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } } }
+            { 1 H{ } } 
+        } }
+        { start-state 0 }
+        { final-states H{ { 1 1 } } }
+    } negate-table
+] unit-test
diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor

new file mode 100644 (file)

index 0000000..0cfcdfc
--- /dev/null
+++ b/basis/regexp/negation/negation.factor
@@ -0,0 +1,61 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.nfa regexp.disambiguate kernel sequences
+assocs regexp.classes hashtables accessors fry vectors
+regexp.ast regexp.transition-tables regexp.minimize namespaces ;
+IN: regexp.negation
+
+: ast>dfa ( parse-tree -- minimal-dfa )
+    construct-nfa nfa>dfa ;
+
+CONSTANT: fail-state -1
+
+: add-default-transition ( state's-transitions -- new-state's-transitions )
+    clone dup
+    [ [ fail-state ] dip keys [ <not-class> ] map <and-class> ] keep set-at ;
+
+: fail-state-recurses ( transitions -- new-transitions )
+    clone dup
+    [ fail-state t associate fail-state ] dip set-at ;
+
+: add-fail-state ( transitions -- new-transitions )
+    [ add-default-transition ] assoc-map
+    fail-state-recurses ;
+
+: assoc>set ( assoc -- keys-set )
+    [ drop dup ] assoc-map ;
+
+: inverse-final-states ( transition-table -- final-states )
+    [ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
+
+: negate-table ( transition-table -- transition-table )
+    clone
+        [ add-fail-state ] change-transitions
+        dup inverse-final-states >>final-states ;
+
+: renumber-transitions ( transitions numbering -- new-transitions )
+    dup '[
+        [ _ at ]
+        [ [ [ _ at ] map ] assoc-map ] bi*
+    ] assoc-map ;
+
+: renumber-states ( transition-table -- transition-table )
+    dup transitions>> keys [ next-state ] H{ } map>assoc
+    [ renumber-transitions ] rewrite-transitions ;
+
+: box-transitions ( transition-table -- transition-table )
+    [ [ [ 1vector ] assoc-map ] assoc-map ] change-transitions ;
+
+: unify-final-state ( transition-table -- transition-table )
+    dup [ final-states>> keys ] keep
+    '[ -2 epsilon _ add-transition ] each
+    H{ { -2 -2 } } >>final-states ;
+
+: adjoin-dfa ( transition-table -- start end )
+    box-transitions unify-final-state renumber-states
+    [ start-state>> ]
+    [ final-states>> keys first ]
+    [ nfa-table get [ transitions>> ] bi@ swap update ] tri ;
+
+M: negation nfa-node ( node -- start end )
+    term>> ast>dfa negate-table adjoin-dfa ;
diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor

index 537c85c2d3b20acfd305a3903bab4b27a3a08667..55147a1d269fca1cc72d2e694de28273cecab5bf 100644 (file)
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@@ -1,235 +1,155 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs grouping kernel regexp.backend
-locals math namespaces regexp.parser sequences fry quotations
-math.order math.ranges vectors unicode.categories regexp.utils
-regexp.transition-tables words sets regexp.classes unicode.case.private ;
+USING: accessors arrays assocs grouping kernel
+locals math namespaces sequences fry quotations
+math.order math.ranges vectors unicode.categories
+regexp.transition-tables words sets hashtables combinators.short-circuit
+unicode.case.private regexp.ast regexp.classes ;
+IN: regexp.nfa
+
  ! This uses unicode.case.private for ch>upper and ch>lower
  ! but case-insensitive matching should be done by case-folding everything
  ! before processing starts
-IN: regexp.nfa
  
-ERROR: feature-is-broken feature ;
-
-SYMBOL: negation-mode
-: negated? ( -- ? ) negation-mode get 0 or odd? ; 
-
-SINGLETON: eps
-
-MIXIN: traversal-flag
-SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
-SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
-SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
-SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
-SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
-SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
-SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
-SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
-SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
-
-: options ( -- obj ) current-regexp get options>> ;
-
-: option? ( obj -- ? ) options key? ;
-
-: option-on ( obj -- ) options conjoin ;
-
-: option-off ( obj -- ) options delete-at ;
-
-: next-state ( regexp -- state )
-    [ state>> ] [ [ 1+ ] change-state drop ] bi ;
-
-: set-start-state ( regexp -- )
-    dup stack>> [
-        drop
-    ] [
-        [ nfa-table>> ] [ pop first ] bi* >>start-state drop
-    ] if-empty ;
-
-GENERIC: nfa-node ( node -- )
-
-:: add-simple-entry ( obj class -- )
-    [let* | regexp [ current-regexp get ]
-            s0 [ regexp next-state ]
-            s1 [ regexp next-state ]
-            stack [ regexp stack>> ]
-            table [ regexp nfa-table>> ] |
-        negated? [
-            s0 f obj class make-transition table add-transition
-            s0 s1 <default-transition> table add-transition
-        ] [
-            s0 s1 obj class make-transition table add-transition
-        ] if
-        s0 s1 2array stack push
-        t s1 table final-states>> set-at ] ;
-
-: add-traversal-flag ( flag -- )
-    stack peek second
-    current-regexp get nfa-traversal-flags>> push-at ;
-
-:: concatenate-nodes ( -- )
-    [let* | regexp [ current-regexp get ]
-            stack [ regexp stack>> ]
-            table [ regexp nfa-table>> ]
-            s2 [ stack peek first ]
-            s3 [ stack pop second ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ] |
-        s1 s2 eps <literal-transition> table add-transition
-        s1 table final-states>> delete-at
-        s0 s3 2array stack push ] ;
-
-:: alternate-nodes ( -- )
-    [let* | regexp [ current-regexp get ]
-            stack [ regexp stack>> ]
-            table [ regexp nfa-table>> ]
-            s2 [ stack peek first ]
-            s3 [ stack pop second ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ]
-            s4 [ regexp next-state ]
-            s5 [ regexp next-state ] |
-        s4 s0 eps <literal-transition> table add-transition
-        s4 s2 eps <literal-transition> table add-transition
-        s1 s5 eps <literal-transition> table add-transition
-        s3 s5 eps <literal-transition> table add-transition
-        s1 table final-states>> delete-at
-        s3 table final-states>> delete-at
-        t s5 table final-states>> set-at
-        s4 s5 2array stack push ] ;
-
-M: kleene-star nfa-node ( node -- )
-    term>> nfa-node
-    [let* | regexp [ current-regexp get ]
-            stack [ regexp stack>> ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ]
-            s2 [ regexp next-state ]
-            s3 [ regexp next-state ]
-            table [ regexp nfa-table>> ] |
-        s1 table final-states>> delete-at
-        t s3 table final-states>> set-at
-        s1 s0 eps <literal-transition> table add-transition
-        s2 s0 eps <literal-transition> table add-transition
-        s2 s3 eps <literal-transition> table add-transition
-        s1 s3 eps <literal-transition> table add-transition
-        s2 s3 2array stack push ] ;
-
-M: concatenation nfa-node ( node -- )
-    seq>>
-    reversed-regexp option? [ <reversed> ] when
-    [ [ nfa-node ] each ]
-    [ length 1- [ concatenate-nodes ] times ] bi ;
-
-M: alternation nfa-node ( node -- )
-    seq>>
-    [ [ nfa-node ] each ]
-    [ length 1- [ alternate-nodes ] times ] bi ;
-
-M: constant nfa-node ( node -- )
-    case-insensitive option? [
-        dup char>> [ ch>lower ] [ ch>upper ] bi
-        2dup = [
-            2drop
-            char>> literal-transition add-simple-entry
-        ] [
-            [ literal-transition add-simple-entry ] bi@
-            alternate-nodes drop
-        ] if
-    ] [
-        char>> literal-transition add-simple-entry
-    ] if ;
+GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
+! This is unfinished and does nothing right now!
+
+M: object remove-lookahead ;
+
+M: with-options remove-lookahead
+    [ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
+
+M: alternation remove-lookahead
+    [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ alternation boa ;
+
+M: concatenation remove-lookahead ;
+
+SYMBOL: option-stack
+
+SYMBOL: state
+
+: next-state ( -- state )
+    state [ get ] [ inc ] bi ;
+
+SYMBOL: nfa-table
+
+: set-each ( keys value hashtable -- )
+    '[ _ swap _ set-at ] each ;
+
+: options>hash ( options -- hashtable )
+    H{ } clone [
+        [ [ on>> t ] dip set-each ]
+        [ [ off>> f ] dip set-each ] 2bi
+    ] keep ;
+
+: using-options ( options quot -- )
+    [ options>hash option-stack [ ?push ] change ] dip
+    call option-stack get pop* ; inline
  
-M: epsilon nfa-node ( node -- )
-    drop eps literal-transition add-simple-entry ;
+: option? ( obj -- ? )
+    option-stack get assoc-stack ;
  
-M: word nfa-node ( node -- ) class-transition add-simple-entry ;
+GENERIC: nfa-node ( node -- start-state end-state )
  
-M: any-char nfa-node ( node -- )
-    [ dotall option? ] dip any-char-no-nl ?
-    class-transition add-simple-entry ;
+: add-simple-entry ( obj -- start-state end-state )
+    [ next-state next-state 2dup ] dip
+    nfa-table get add-transition ;
  
-! M: beginning-of-text nfa-node ( node -- ) ;
+: epsilon-transition ( source target -- )
+    epsilon nfa-table get add-transition ;
  
-M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M:: star nfa-node ( node -- start end )
+    node term>> nfa-node :> s1 :> s0
+    next-state :> s2
+    next-state :> s3
+    s1 s0 epsilon-transition
+    s2 s0 epsilon-transition
+    s2 s3 epsilon-transition
+    s1 s3 epsilon-transition
+    s2 s3 ;
  
-M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M: tagged-epsilon nfa-node
+    add-simple-entry ;
  
-: choose-letter-class ( node -- node' )
-    case-insensitive option? Letter-class rot ? ;
+M: concatenation nfa-node ( node -- start end )
+    [ first>> ] [ second>> ] bi
+    reversed-regexp option? [ swap ] when
+    [ nfa-node ] bi@
+    [ epsilon-transition ] dip ;
  
-M: letter-class nfa-node ( node -- )
-    choose-letter-class class-transition add-simple-entry ;
+:: alternate-nodes ( s0 s1 s2 s3 -- start end )
+    next-state :> s4
+    next-state :> s5
+    s4 s0 epsilon-transition
+    s4 s2 epsilon-transition
+    s1 s5 epsilon-transition
+    s3 s5 epsilon-transition
+    s4 s5 ;
  
-M: LETTER-class nfa-node ( node -- )
-    choose-letter-class class-transition add-simple-entry ;
+M: alternation nfa-node ( node -- start end )
+    [ first>> ] [ second>> ] bi
+    [ nfa-node ] bi@
+    alternate-nodes ;
  
-M: character-class-range nfa-node ( node -- )
+GENERIC: modify-class ( char-class -- char-class' )
+
+M: object modify-class ;
+
+M: integer modify-class
+    case-insensitive option? [
+        dup Letter? [
+            [ ch>lower ] [ ch>upper ] bi 2array <or-class>
+        ] when
+    ] when ;
+
+M: integer nfa-node ( node -- start end )
+    modify-class add-simple-entry ;
+
+M: primitive-class modify-class
+    class>> modify-class <primitive-class> ;
+
+M: or-class modify-class
+    seq>> [ modify-class ] map <or-class> ;
+
+M: not-class modify-class
+    class>> modify-class <not-class> ;
+
+M: any-char modify-class
+    drop dotall option? t any-char-no-nl ? ;
+
+: modify-letter-class ( class -- newclass )
+    case-insensitive option? [ drop Letter-class ] when ;
+M: letter-class modify-class modify-letter-class ;
+M: LETTER-class modify-class modify-letter-class ;
+
+: cased-range? ( range -- ? )
+    [ from>> ] [ to>> ] bi {
+        [ [ letter? ] bi@ and ]
+        [ [ LETTER? ] bi@ and ]
+    } 2|| ;
+
+M: range modify-class
      case-insensitive option? [
-        ! This should be implemented for Unicode by case-folding
-        ! the input and all strings in the regexp.
-        dup [ from>> ] [ to>> ] bi
-        2dup [ Letter? ] bi@ and [
-            rot drop
-            [ [ ch>lower ] bi@ character-class-range boa ]
-            [ [ ch>upper ] bi@ character-class-range boa ] 2bi 
-            [ class-transition add-simple-entry ] bi@
-            alternate-nodes
-        ] [
-            2drop
-            class-transition add-simple-entry
-        ] if
-    ] [
-        class-transition add-simple-entry
-    ] if ;
-
-M: capture-group nfa-node ( node -- )
-    "capture-groups" feature-is-broken
-    eps literal-transition add-simple-entry
-    capture-group-on add-traversal-flag
-    term>> nfa-node
-    eps literal-transition add-simple-entry
-    capture-group-off add-traversal-flag
-    2 [ concatenate-nodes ] times ;
-
-! xyzzy
-M: non-capture-group nfa-node ( node -- )
-    term>> nfa-node ;
-
-M: reluctant-kleene-star nfa-node ( node -- )
-    term>> <kleene-star> nfa-node ;
-
-M: negation nfa-node ( node -- )
-    negation-mode inc
-    term>> nfa-node 
-    negation-mode dec ;
-
-M: lookahead nfa-node ( node -- )
-    "lookahead" feature-is-broken
-    eps literal-transition add-simple-entry
-    lookahead-on add-traversal-flag
-    term>> nfa-node
-    eps literal-transition add-simple-entry
-    lookahead-off add-traversal-flag
-    2 [ concatenate-nodes ] times ;
-
-M: lookbehind nfa-node ( node -- )
-    "lookbehind" feature-is-broken
-    eps literal-transition add-simple-entry
-    lookbehind-on add-traversal-flag
-    term>> nfa-node
-    eps literal-transition add-simple-entry
-    lookbehind-off add-traversal-flag
-    2 [ concatenate-nodes ] times ;
-
-M: option nfa-node ( node -- )
-    [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
-    eps literal-transition add-simple-entry ;
-
-: construct-nfa ( regexp -- )
+        dup cased-range? [
+            [ from>> ] [ to>> ] bi
+            [ [ ch>lower ] bi@ <range> ]
+            [ [ ch>upper ] bi@ <range> ] 2bi 
+            2array <or-class>
+        ] when
+    ] when ;
+
+M: class nfa-node
+    modify-class add-simple-entry ;
+
+M: with-options nfa-node ( node -- start end )
+    dup options>> [ tree>> nfa-node ] using-options ;
+
+: construct-nfa ( ast -- nfa-table )
      [
-        reset-regexp
-        negation-mode off
-        [ current-regexp set ]
-        [ parse-tree>> nfa-node ]
-        [ set-start-state ] tri
+        0 state set
+        <transition-table> nfa-table set
+        remove-lookahead nfa-node
+        nfa-table get
+            swap dup associate >>final-states
+            swap >>start-state
      ] with-scope ;
diff --git a/basis/regexp/parser/parser-tests.factor b/basis/regexp/parser/parser-tests.factor

index fe4d2f1d1a877d141c679519b22a8eb4e58df88e..d606015f617e19e5e3a181174e0425df838593c1 100644 (file)
--- a/basis/regexp/parser/parser-tests.factor
+++ b/basis/regexp/parser/parser-tests.factor
@@ -1,34 +1,24 @@
-USING: kernel tools.test regexp.backend regexp ;
-IN: regexp.parser
+USING: kernel tools.test regexp.parser fry sequences ;
+IN: regexp.parser.tests
  
-: test-regexp ( string -- )
-    default-regexp parse-regexp ;
+: regexp-parses ( string -- )
+    [ [ ] ] dip '[ _ parse-regexp drop ] unit-test ;
  
-! [ "(" ] [ unmatched-parentheses? ] must-fail-with
+: regexp-fails ( string -- )
+    '[ _ parse-regexp ] must-fail ;
  
-[ ] [ "a|b" test-regexp ] unit-test
-[ ] [ "a.b" test-regexp ] unit-test
-[ ] [ "a|b|c" test-regexp ] unit-test
-[ ] [ "abc|b" test-regexp ] unit-test
-[ ] [ "a|bcd" test-regexp ] unit-test
-[ ] [ "a|(b)" test-regexp ] unit-test
-[ ] [ "(a)|b" test-regexp ] unit-test
-[ ] [ "(a|b)" test-regexp ] unit-test
-[ ] [ "((a)|(b))" test-regexp ] unit-test
+{
+    "a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||"
+    "(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|"
+    "[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]"
+    "[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
+    "(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]"
+    "[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)"
+    "\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}"
+    "\\ueeee" "\\0333" "\\xff" "\\\\" "\\w"
+} [ regexp-parses ] each
  
-[ ] [ "(?:a)" test-regexp ] unit-test
-[ ] [ "(?i:a)" test-regexp ] unit-test
-[ ] [ "(?-i:a)" test-regexp ] unit-test
-[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
-[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
-
-[ ] [ "(?=a)" test-regexp ] unit-test
-
-[ ] [ "[abc]" test-regexp ] unit-test
-[ ] [ "[a-c]" test-regexp ] unit-test
-[ ] [ "[^a-c]" test-regexp ] unit-test
-[ "[^]" test-regexp ] must-fail
-
-[ ] [ "|b" test-regexp ] unit-test
-[ ] [ "b|" test-regexp ] unit-test
-[ ] [ "||" test-regexp ] unit-test
+{
+    "[^]" "[]" "a{foo}" "a{,}" "a{}" "(?)" "\\p{foo}" "\\P{foo}"
+    "\\ueeeg" "\\0339" "\\xfg"
+} [ regexp-fails ] each
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor

index 377535eccd1aac074ac4b39bbfc18472c860bcc5..ed0762cc3ab39bb61b3f46c870345f1178ecc39d 100644 (file)
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -1,437 +1,175 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs combinators io io.streams.string
-kernel math math.parser namespaces sets
-quotations sequences splitting vectors math.order
-strings regexp.backend regexp.utils
-unicode.case unicode.categories words locals regexp.classes ;
+USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
+combinators regexp.classes strings splitting peg locals accessors
+regexp.ast ;
  IN: regexp.parser
  
-FROM: math.ranges => [a,b] ;
-
-TUPLE: concatenation seq ; INSTANCE: concatenation node
-TUPLE: alternation seq ; INSTANCE: alternation node
-TUPLE: kleene-star term ; INSTANCE: kleene-star node
-
-! !!!!!!!!
-TUPLE: possessive-question term ; INSTANCE: possessive-question node
-TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node
-
-! !!!!!!!!
-TUPLE: reluctant-question term ; INSTANCE: reluctant-question node
-TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
-
-TUPLE: negation term ; INSTANCE: negation node
-TUPLE: constant char ; INSTANCE: constant node
-TUPLE: range from to ; INSTANCE: range node
-
-MIXIN: parentheses-group
-TUPLE: lookahead term ; INSTANCE: lookahead node
-INSTANCE: lookahead parentheses-group
-TUPLE: lookbehind term ; INSTANCE: lookbehind node
-INSTANCE: lookbehind parentheses-group
-TUPLE: capture-group term ; INSTANCE: capture-group node
-INSTANCE: capture-group parentheses-group
-TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
-INSTANCE: non-capture-group parentheses-group
-TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
-INSTANCE: independent-group parentheses-group
-TUPLE: comment-group term ; INSTANCE: comment-group node
-INSTANCE: comment-group parentheses-group
-
-SINGLETON: epsilon INSTANCE: epsilon node
-
-TUPLE: option option on? ; INSTANCE: option node
-
-SINGLETONS: unix-lines dotall multiline comments case-insensitive
-unicode-case reversed-regexp ;
-
-SINGLETONS: beginning-of-character-class end-of-character-class
-left-parenthesis pipe caret dash ;
-
-: push1 ( obj -- ) input-stream get stream>> push ;
-: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
-: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
-: drop1 ( -- ) read1 drop ;
-
-: stack ( -- obj ) current-regexp get stack>> ;
-: change-whole-stack ( quot -- )
-    current-regexp get
-    [ stack>> swap call ] keep (>>stack) ; inline
-: push-stack ( obj -- ) stack push ;
-: pop-stack ( -- obj ) stack pop ;
-: cut-out ( vector n -- vector' vector ) cut rest ;
-ERROR: cut-stack-error ;
-: cut-stack ( obj vector -- vector' vector )
-    [ nip ] [ last-index ] 2bi [ cut-stack-error ] unless* cut-out swap ;
-
-: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
-: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
-: <possessive-question> ( obj -- kleene ) possessive-question boa ;
-: <reluctant-question> ( obj -- kleene ) reluctant-question boa ;
-
-: <negation> ( obj -- negation ) negation boa ;
-: <concatenation> ( seq -- concatenation )
-    >vector [ epsilon ] [ concatenation boa ] if-empty ;
-: <alternation> ( seq -- alternation ) >vector alternation boa ;
-: <capture-group> ( obj -- capture-group ) capture-group boa ;
-: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
-: <constant> ( obj -- constant ) constant boa ;
-
-: first|concatenation ( seq -- first/concatenation )
-    dup length 1 = [ first ] [ <concatenation> ] if ;
-
-: first|alternation ( seq -- first/alternation )
-    dup length 1 = [ first ] [ <alternation> ] if ;
-
-: <character-class-range> ( from to -- obj )
-    2dup <
-    [ character-class-range boa ] [ 2drop unmatchable-class ] if ;
-
-ERROR: unmatched-parentheses ;
-
-ERROR: unknown-regexp-option option ;
+: allowed-char? ( ch -- ? )
+    ".()|[*+?$^" member? not ;
  
-: ch>option ( ch -- singleton )
+ERROR: bad-number ;
+
+: ensure-number ( n -- n )
+    [ bad-number ] unless* ;
+
+:: at-error ( key assoc quot: ( key -- replacement ) -- value )
+    key assoc at* [ drop key quot call ] unless ; inline
+
+ERROR: bad-class name ;
+
+: name>class ( name -- class )
+    {
+        { "Lower" letter-class }
+        { "Upper" LETTER-class }
+        { "Alpha" Letter-class }
+        { "ASCII" ascii-class }
+        { "Digit" digit-class }
+        { "Alnum" alpha-class }
+        { "Punct" punctuation-class }
+        { "Graph" java-printable-class }
+        { "Print" java-printable-class }
+        { "Blank" non-newline-blank-class }
+        { "Cntrl" control-character-class }
+        { "XDigit" hex-digit-class }
+        { "Space" java-blank-class }
+        ! TODO: unicode-character-class
+    } [ bad-class ] at-error ;
+
+: lookup-escape ( char -- ast )
      {
-        { CHAR: i [ case-insensitive ] }
-        { CHAR: d [ unix-lines ] }
-        { CHAR: m [ multiline ] }
-        { CHAR: n [ multiline ] }
-        { CHAR: r [ reversed-regexp ] }
-        { CHAR: s [ dotall ] }
-        { CHAR: u [ unicode-case ] }
-        { CHAR: x [ comments ] }
-        [ unknown-regexp-option ]
+        { CHAR: t [ CHAR: \t ] }
+        { CHAR: n [ CHAR: \n ] }
+        { CHAR: r [ CHAR: \r ] }
+        { CHAR: f [ HEX: c ] }
+        { CHAR: a [ HEX: 7 ] }
+        { CHAR: e [ HEX: 1b ] }
+        { CHAR: \\ [ CHAR: \\ ] }
+
+        { CHAR: w [ c-identifier-class <primitive-class> ] }
+        { CHAR: W [ c-identifier-class <primitive-class> <not-class> ] }
+        { CHAR: s [ java-blank-class <primitive-class> ] }
+        { CHAR: S [ java-blank-class <primitive-class> <not-class> ] }
+        { CHAR: d [ digit-class <primitive-class> ] }
+        { CHAR: D [ digit-class <primitive-class> <not-class> ] }
+
+        { CHAR: z [ end-of-input <tagged-epsilon> ] }
+        { CHAR: A [ beginning-of-input <tagged-epsilon> ] }
+        [ ]
      } case ;
  
+: options-assoc ( -- assoc )
+    H{
+        { CHAR: i case-insensitive }
+        { CHAR: d unix-lines }
+        { CHAR: m multiline }
+        { CHAR: n multiline }
+        { CHAR: r reversed-regexp }
+        { CHAR: s dotall }
+        { CHAR: u unicode-case }
+        { CHAR: x comments }
+    } ;
+
+: ch>option ( ch -- singleton )
+    options-assoc at ;
+
  : option>ch ( option -- string )
-    {
-        { case-insensitive [ CHAR: i ] }
-        { multiline [ CHAR: m ] }
-        { reversed-regexp [ CHAR: r ] }
-        { dotall [ CHAR: s ] }
-        [ unknown-regexp-option ]
-    } case ;
+    options-assoc value-at ;
  
-: toggle-option ( ch ? -- ) 
-    [ ch>option ] dip option boa push-stack ;
-
-: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
-
-: parse-options ( string -- )
-    "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
-
-ERROR: bad-special-group string ;
-
-DEFER: (parse-regexp)
-: nested-parse-regexp ( token ? -- )
-    [ push-stack (parse-regexp) pop-stack ] dip
-    [ <negation> ] when pop-stack new swap >>term push-stack ;
-
-! non-capturing groups
-: (parse-special-group) ( -- )
-    read1 {
-        { [ dup CHAR: # = ] ! comment
-            [ drop comment-group f nested-parse-regexp pop-stack drop ] }
-        { [ dup CHAR: : = ]
-            [ drop non-capture-group f nested-parse-regexp ] }
-        { [ dup CHAR: = = ]
-            [ drop lookahead f nested-parse-regexp ] }
-        { [ dup CHAR: ! = ]
-            [ drop lookahead t nested-parse-regexp ] }
-        { [ dup CHAR: > = ]
-            [ drop non-capture-group f nested-parse-regexp ] }
-        { [ dup CHAR: < = peek1 CHAR: = = and ]
-            [ drop drop1 lookbehind f nested-parse-regexp ] }
-        { [ dup CHAR: < = peek1 CHAR: ! = and ]
-            [ drop drop1 lookbehind t nested-parse-regexp ] }
-        [
-            ":)" read-until
-            [ swap prefix ] dip
-            {
-                { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] }
-                { CHAR: ) [ parse-options ] }
-                [ drop bad-special-group ]
-            } case
-        ]
-    } cond ;
-
-: handle-left-parenthesis ( -- )
-    peek1 CHAR: ? =
-    [ drop1 (parse-special-group) ]
-    [ capture-group f nested-parse-regexp ] if ;
-
-: handle-dot ( -- ) any-char push-stack ;
-: handle-pipe ( -- ) pipe push-stack ;
-: (handle-star) ( obj -- kleene-star )
-    peek1 {
-        { CHAR: + [ drop1 <possessive-kleene-star> ] }
-        { CHAR: ? [ drop1 <reluctant-kleene-star> ] }
-        [ drop <kleene-star> ]
-    } case ;
-: handle-star ( -- ) stack pop (handle-star) push-stack ;
-: handle-question ( -- )
-    stack pop peek1 {
-        { CHAR: + [ drop1 <possessive-question> ] }
-        { CHAR: ? [ drop1 <reluctant-question> ] }
-        [ drop epsilon 2array <alternation> ]
-    } case push-stack ;
-: handle-plus ( -- )
-    stack pop dup (handle-star)
-    2array <concatenation> push-stack ;
-
-ERROR: unmatched-brace ;
-: parse-repetition ( -- start finish ? )
-    "}" read-until [ unmatched-brace ] unless
-    [ "," split1 [ string>number ] bi@ ]
-    [ CHAR: , swap index >boolean ] bi ;
-
-: replicate/concatenate ( n obj -- obj' )
-    over zero? [ 2drop epsilon ]
-    [ <repetition> first|concatenation ] if ;
-
-: exactly-n ( n -- )
-    stack pop replicate/concatenate push-stack ;
-
-: at-least-n ( n -- )
-    stack pop
-    [ replicate/concatenate ] keep
-    <kleene-star> 2array <concatenation> push-stack ;
-
-: at-most-n ( n -- )
-    1+
-    stack pop
-    [ replicate/concatenate ] curry map <alternation> push-stack ;
-
-: from-m-to-n ( m n -- )
-    [a,b]
-    stack pop
-    [ replicate/concatenate ] curry map
-    <alternation> push-stack ;
-
-ERROR: invalid-range a b ;
-
-: handle-left-brace ( -- )
-    parse-repetition
-    [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
-    [
-        2dup and [ from-m-to-n ]
-        [ [ nip at-most-n ] [ at-least-n ] if* ] if
-    ] [ drop 0 max exactly-n ] if ;
-
-: handle-front-anchor ( -- ) beginning-of-line push-stack ;
-: handle-back-anchor ( -- ) end-of-line push-stack ;
-
-ERROR: bad-character-class obj ;
-ERROR: expected-posix-class ;
-
-: parse-posix-class ( -- obj )
-    read1 CHAR: { = [ expected-posix-class ] unless
-    "}" read-until [ bad-character-class ] unless
-    {
-        { "Lower" [ letter-class ] }
-        { "Upper" [ LETTER-class ] }
-        { "Alpha" [ Letter-class ] }
-        { "ASCII" [ ascii-class ] }
-        { "Digit" [ digit-class ] }
-        { "Alnum" [ alpha-class ] }
-        { "Punct" [ punctuation-class ] }
-        { "Graph" [ java-printable-class ] }
-        { "Print" [ java-printable-class ] }
-        { "Blank" [ non-newline-blank-class ] }
-        { "Cntrl" [ control-character-class ] }
-        { "XDigit" [ hex-digit-class ] }
-        { "Space" [ java-blank-class ] }
-        ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
-        [ bad-character-class ]
-    } case ;
+: parse-options ( on off -- options )
+    [ [ ch>option ] { } map-as ] bi@ <options> ;
  
-: parse-octal ( -- n ) 3 read oct> check-octal ;
-: parse-short-hex ( -- n ) 2 read hex> check-hex ;
-: parse-long-hex ( -- n ) 6 read hex> check-hex ;
-: parse-control-character ( -- n ) read1 ;
+: string>options ( string -- options )
+    "-" split1 parse-options ;
+ 
+: options>string ( options -- string )
+    [ on>> ] [ off>> ] bi
+    [ [ option>ch ] map ] bi@
+    [ "-" glue ] unless-empty
+    "" like ;
  
-ERROR: bad-escaped-literals seq ;
+! TODO: add syntax for various parenthized things,
+!       add greedy and nongreedy forms of matching
+! (once it's all implemented)
  
-: parse-til-E ( -- obj )
-    "\\E" read-until [ bad-escaped-literals ] unless ;
-    
-:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
-    parse-til-E
-    drop1
-    [ epsilon ] [
-        quot call [ <constant> ] V{ } map-as
-        first|concatenation
-    ] if-empty ; inline
+EBNF: parse-regexp
  
-: parse-escaped-literals ( -- obj )
-    [ ] (parse-escaped-literals) ;
+CharacterInBracket = !("}") Character
  
-: lower-case-literals ( -- obj )
-    [ >lower ] (parse-escaped-literals) ;
+QuotedCharacter = !("\\E") .
  
-: upper-case-literals ( -- obj )
-    [ >upper ] (parse-escaped-literals) ;
+Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
+       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
+       | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
+       | "u" Character:a Character:b Character:c Character:d
+            => [[ { a b c d } hex> ensure-number ]]
+       | "x" Character:a Character:b
+            => [[ { a b } hex> ensure-number ]]
+       | "0" Character:a Character:b Character:c
+            => [[ { a b c } oct> ensure-number ]]
+       | . => [[ lookup-escape ]]
  
-: parse-escaped ( -- obj )
-    read1
-    {
-        { CHAR: t [ CHAR: \t <constant> ] }
-        { CHAR: n [ CHAR: \n <constant> ] }
-        { CHAR: r [ CHAR: \r <constant> ] }
-        { CHAR: f [ HEX: c <constant> ] }
-        { CHAR: a [ HEX: 7 <constant> ] }
-        { CHAR: e [ HEX: 1b <constant> ] }
-
-        { CHAR: w [ c-identifier-class ] }
-        { CHAR: W [ c-identifier-class <negation> ] }
-        { CHAR: s [ java-blank-class ] }
-        { CHAR: S [ java-blank-class <negation> ] }
-        { CHAR: d [ digit-class ] }
-        { CHAR: D [ digit-class <negation> ] }
-
-        { CHAR: p [ parse-posix-class ] }
-        { CHAR: P [ parse-posix-class <negation> ] }
-        { CHAR: x [ parse-short-hex <constant> ] }
-        { CHAR: u [ parse-long-hex <constant> ] }
-        { CHAR: 0 [ parse-octal <constant> ] }
-        { CHAR: c [ parse-control-character ] }
-
-        { CHAR: Q [ parse-escaped-literals ] }
-
-        ! { CHAR: b [ word-boundary-class ] }
-        ! { CHAR: B [ word-boundary-class <negation> ] }
-        ! { CHAR: A [ handle-beginning-of-input ] }
-        ! { CHAR: z [ handle-end-of-input ] }
-
-        ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
-
-        ! m//g mode
-        ! { CHAR: G [ end of previous match ] }
-
-        ! Group capture
-        ! { CHAR: 1 [ CHAR: 1 <constant> ] }
-        ! { CHAR: 2 [ CHAR: 2 <constant> ] }
-        ! { CHAR: 3 [ CHAR: 3 <constant> ] }
-        ! { CHAR: 4 [ CHAR: 4 <constant> ] }
-        ! { CHAR: 5 [ CHAR: 5 <constant> ] }
-        ! { CHAR: 6 [ CHAR: 6 <constant> ] }
-        ! { CHAR: 7 [ CHAR: 7 <constant> ] }
-        ! { CHAR: 8 [ CHAR: 8 <constant> ] }
-        ! { CHAR: 9 [ CHAR: 9 <constant> ] }
-
-        ! Perl extensions
-        ! can't do \l and \u because \u is already a 4-hex
-        { CHAR: L [ lower-case-literals ] }
-        { CHAR: U [ upper-case-literals ] }
-
-        [ <constant> ]
-    } case ;
+EscapeSequence = "\\" Escape:e => [[ e ]]
  
-: handle-escape ( -- ) parse-escaped push-stack ;
-
-: handle-dash ( vector -- vector' )
-    H{ { dash CHAR: - } } substitute ;
-
-: character-class>alternation ( seq -- alternation )
-    [ dup number? [ <constant> ] when ] map first|alternation ;
-
-: handle-caret ( vector -- vector' )
-    dup [ length 2 >= ] [ first caret eq? ] bi and [
-        rest-slice character-class>alternation <negation>
-    ] [
-        character-class>alternation
-    ] if ;
-
-: make-character-class ( -- character-class )
-    [ beginning-of-character-class swap cut-stack ] change-whole-stack
-    handle-dash handle-caret ;
-
-: apply-dash ( -- )
-    stack [ pop3 nip <character-class-range> ] keep push ;
-
-: apply-dash? ( -- ? )
-    stack dup length 3 >=
-    [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
-
-ERROR: empty-negated-character-class ;
-DEFER: handle-left-bracket
-: (parse-character-class) ( -- )
-    read1 [ empty-negated-character-class ] unless* {
-        { CHAR: [ [ handle-left-bracket t ] }
-        { CHAR: ] [ make-character-class push-stack f ] }
-        { CHAR: - [ dash push-stack t ] }
-        { CHAR: \ [ parse-escaped push-stack t ] }
-        [ push-stack apply-dash? [ apply-dash ] when t ]
-    } case
-    [ (parse-character-class) ] when ;
-
-: push-constant ( ch -- ) <constant> push-stack ;
-
-: parse-character-class-second ( -- )
-    read1 {
-        { CHAR: [ [ CHAR: [ push-constant ] }
-        { CHAR: ] [ CHAR: ] push-constant ] }
-        { CHAR: - [ CHAR: - push-constant ] }
-        [ push1 ]
-    } case ;
+Character = EscapeSequence
+          | "$" => [[ $ <tagged-epsilon> ]]
+          | "^" => [[ ^ <tagged-epsilon> ]]
+          | . ?[ allowed-char? ]?
  
-: parse-character-class-first ( -- )
-    read1 {
-        { CHAR: ^ [ caret push-stack parse-character-class-second ] }
-        { CHAR: [ [ CHAR: [ push-constant ] }
-        { CHAR: ] [ CHAR: ] push-constant ] }
-        { CHAR: - [ CHAR: - push-constant ] }
-        [ push1 ]
-    } case ;
+AnyRangeCharacter = EscapeSequence | .
  
-: handle-left-bracket ( -- )
-    beginning-of-character-class push-stack
-    parse-character-class-first (parse-character-class) ;
+RangeCharacter = !("]") AnyRangeCharacter
  
-: finish-regexp-parse ( stack -- obj )
-    { pipe } split
-    [ first|concatenation ] map first|alternation ;
+Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+      | RangeCharacter
  
-: handle-right-parenthesis ( -- )
-    stack dup [ parentheses-group "members" word-prop member? ] find-last
-    -rot cut rest
-    [ [ push ] keep current-regexp get (>>stack) ]
-    [ finish-regexp-parse push-stack ] bi* ;
+StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+           | AnyRangeCharacter
  
-: parse-regexp-token ( token -- ? )
-    {
-        { CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning?
-        { CHAR: ) [ handle-right-parenthesis f ] }
-        { CHAR: . [ handle-dot t ] }
-        { CHAR: | [ handle-pipe t ] }
-        { CHAR: ? [ handle-question t ] }
-        { CHAR: * [ handle-star t ] }
-        { CHAR: + [ handle-plus t ] }
-        { CHAR: { [ handle-left-brace t ] }
-        { CHAR: [ [ handle-left-bracket t ] }
-        { CHAR: \ [ handle-escape t ] }
-        [
-            dup CHAR: $ = peek1 f = and
-            [ drop handle-back-anchor f ]
-            [ push-constant t ] if
-        ]
-    } case ;
+Ranges = StartRange:s Range*:r => [[ r s prefix ]]
+
+CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
+
+Options = [idmsux]*
+
+Parenthized = "?:" Alternation:a => [[ a ]]
+            | "?" Options:on "-"? Options:off ":" Alternation:a
+                => [[ a on off parse-options <with-options> ]]
+            | "?#" [^)]* => [[ f ]]
+            | "?~" Alternation:a => [[ a <negation> ]]
+            | "?=" Alternation:a => [[ a <lookahead> ]]
+            | "?!" Alternation:a => [[ a <negation> <lookahead> ]]
+            | "?<=" Alternation:a => [[ a <lookbehind> ]]
+            | "?<!" Alternation:a => [[ a <negation> <lookbehind> ]]
+            | Alternation
+
+Element = "(" Parenthized:p ")" => [[ p ]]
+        | "[" CharClass:r "]" => [[ r ]]
+        | ".":d => [[ any-char <primitive-class> ]]
+        | Character
+
+Number = (!(","|"}").)* => [[ string>number ensure-number ]]
+
+Times = "," Number:n "}" => [[ 0 n <from-to> ]]
+      | Number:n ",}" => [[ n <at-least> ]]
+      | Number:n "}" => [[ n n <from-to> ]]
+      | "}" => [[ bad-number ]]
+      | Number:n "," Number:m "}" => [[ n m <from-to> ]]
+
+Repeated = Element:e "{" Times:t => [[ e t <times> ]]
+         | Element:e "*+" => [[ e <possessive-star> ]]
+         | Element:e "++" => [[ e <possessive-plus> ]]
+         | Element:e "?" => [[ e <maybe> ]]
+         | Element:e "*" => [[ e <star> ]]
+         | Element:e "+" => [[ e <plus> ]]
+         | Element
+
+Concatenation = Repeated*:r => [[ r sift <concatenation> ]]
+
+Alternation = Concatenation:c ("|" Concatenation)*:a
+                => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]
+
+End = !(.)
  
-: (parse-regexp) ( -- )
-    read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
-
-: parse-regexp-beginning ( -- )
-    peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ;
-
-: parse-regexp ( regexp -- )
-    dup current-regexp [
-        raw>> [
-            <string-reader> [
-                parse-regexp-beginning (parse-regexp)
-            ] with-input-stream
-        ] unless-empty
-        current-regexp get [ finish-regexp-parse ] change-stack
-        dup stack>> >>parse-tree drop
-    ] with-variable ;
+Main = Alternation End
+;EBNF
diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor

index 378ae503ce7257ce331f1b412a1b05121b2c6d1f..9d3d86fa13e797f8da0860707a306a31e33a6f3c 100644 (file)
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@@ -1,8 +1,64 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel strings help.markup help.syntax regexp.backend ;
+USING: kernel strings help.markup help.syntax regexp.matchers ;
  IN: regexp
  
+ABOUT: "regexp"
+
+ARTICLE: "regexp" "Regular expressions"
+"The " { $vocab-link "regexp" } " vocabulary provides word for creating and using regular expressions."
+{ $subsection { "regexp" "syntax" } }
+{ $subsection { "regexp" "construction" } }
+{ $vocab-subsection "regexp.combinators" "Regular expression combinators" }
+{ $subsection { "regexp" "operations" } }
+{ $subsection regexp }
+{ $subsection { "regexp" "theory" } } ;
+
+ARTICLE: { "regexp" "construction" } "Constructing regular expressions"
+"Words which are useful for creating regular expressions:"
+{ $subsection POSTPONE: R/ }
+{ $subsection <regexp> } 
+{ $subsection <optioned-regexp> }
+{ $heading "See also" }
+{ $vocab-link "regexp.combinators" } ;
+
+ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
+"Regexp syntax is largely compatible with Perl, Java and extended POSTFIX regexps, but not completely." $nl
+"A new addition is the inclusion of a negation operator, with the syntax " { $snippet "(?~foo)" } " to match everything that does not match " { $snippet "foo" } "." $nl
+"One missing feature is backreferences. This is because of a design decision to allow only regular expressions following the formal theory of regular languages. For more information, see " { $link { "regexp" "theory" } } ". You can create a new regular expression to match a particular string using " { $vocab-link "regexp.combinators" } " and group capture is available to extract parts of a regular expression match." $nl
+"A distinction from Perl is that " { $snippet "\\G" } ", which references the previous match, is not included. This is because that sequence is inherently stateful, and Factor regexps don't hold state." $nl
+"Additionally, none of the operations which embed code into a regexp are supported, as this would require the inclusion of the Factor parser and compiler in any application which wants to expose regexps to the user. None of the casing operations are included, for simplicity." ; ! Also describe syntax, from the beginning
+
+ARTICLE: { "regexp" "theory" } "The theory of regular expressions"
+"Far from being just a practical tool invented by Unix hackers, regular expressions were studied formally before computer programs were written to process them." $nl
+"A regular language is a set of strings that is matched by a regular expression, which is defined to have characters and the empty string, along with the operations concatenation, disjunction and Kleene star. Another way to define the class of regular languages is as the class of languages which can be recognized with constant space overhead, ie with a DFA. These two definitions are provably equivalent." $nl
+"One basic result in the theory of regular language is that the complement of a regular language is regular. In other words, for any regular expression, there exists another regular expression which matches exactly the strings that the first one doesn't match." $nl
+"This implies, by DeMorgan's law, that, if you have two regular languages, their intersection is also regular. That is, for any two regular expressions, there exists a regular expression which matches strings that match both inputs." $nl
+"Traditionally, regular expressions on computer support an additional operation: backreferences. For example, the Perl regexp " { $snippet "/(.*)$1/" } " matches a string repated twice. If a backreference refers to a string with a predetermined maximum length, then the resulting language is still regular." $nl
+"But, if not, the language is not regular. There is strong evidence that there is no efficient way to parse with backreferences in the general case. Perl uses a naive backtracking algorithm which has pathological behavior in some cases, taking exponential time to match even if backreferences aren't used. Additionally, expressions with backreferences don't have the properties with negation and intersection described above." $nl
+"The Factor regular expression engine was built with the design decision to support negation and intersection at the expense of backreferences. This lets us have a guaranteed linear-time matching algorithm. Systems like Ragel and Lex also use this algorithm, but in the Factor regular expression engine, all other features of regexps are still present." ;
+
+ARTICLE: { "regexp" "operations" } "Matching operations with regular expressions"
+{ $subsection matches? }
+{ $subsection match-slice }
+{ $subsection re-split1 }
+{ $subsection re-split }
+{ $subsection re-replace }
+{ $subsection all-matches }
+{ $subsection count-matches }
+{ $subsection re-replace } ;
+
  HELP: <regexp>
  { $values { "string" string } { "regexp" regexp } }
-{ $description "Compiles a regular expression into a DFA and returns this object.  Regular expressions only have to be compiled once and can then be used multiple times to match input strings." } ;
+{ $description "Creates a regular expression object, given a string in regular expression syntax. When it is first used for matching, a DFA is compiled, and this DFA is stored for reuse so it is only compiled once." } ;
+
+HELP: <optioned-regexp>
+{ $values { "string" string } { "options" string } { "regexp" regexp } }
+{ $description "Given a string in regular expression syntax, and a string of options, creates a regular expression object. When it is first used for matching, a DFA is compiled, and this DFA is stored for reuse so it is only compiled once." } ;
+
+HELP: R/
+{ $syntax "R/ foo.*|[a-zA-Z]bar/i" }
+{ $description "Literal syntax for a regular expression. When this syntax is used, the DFA is compiled at compile-time, rather than on first use." } ;
+
+HELP: regexp
+{ $class-description "The class of regular expressions. To construct these, see " { $link { "regexp" "construction" } } "." } ;
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor

index 1cd9a2392efc87e1646eb52b17ec24fda88b67e1..9d94c4126ba424fd8c0f13626c5d848004782ca9 100644 (file)
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -1,8 +1,10 @@
  USING: regexp tools.test kernel sequences regexp.parser
-regexp.traversal eval strings multiline ;
+regexp.traversal eval strings multiline accessors regexp.matchers ;
  IN: regexp-tests
  
  \ <regexp> must-infer
+\ compile-regexp must-infer
+\ compile-dfa-quot must-infer
  \ matches? must-infer
  
  [ f ] [ "b" "a*" <regexp> matches? ] unit-test
@@ -44,9 +46,9 @@ IN: regexp-tests
  ! Dotall mode -- when on, . matches newlines.
  ! Off by default.
  [ f ] [ "\n" "." <regexp> matches? ] unit-test
-[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
+! [ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
  [ t ] [ "\n" R/ ./s matches? ] unit-test
-[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
+! [ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
  
  [ f ] [ "" ".+" <regexp> matches? ] unit-test
  [ t ] [ "a" ".+" <regexp> matches? ] unit-test
@@ -76,8 +78,6 @@ IN: regexp-tests
  [ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test
  [ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test
  
-/*
-! FIXME
  [ f ] [ "" "(a)" <regexp> matches? ] unit-test
  [ t ] [ "a" "(a)" <regexp> matches? ] unit-test
  [ f ] [ "aa" "(a)" <regexp> matches? ] unit-test
@@ -85,7 +85,6 @@ IN: regexp-tests
  
  [ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test
  [ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test
-*/
  
  [ f ] [ "" "a{1}" <regexp> matches? ] unit-test
  [ t ] [ "a" "a{1}" <regexp> matches? ] unit-test
@@ -168,12 +167,9 @@ IN: regexp-tests
  [ f ] [ "0" "[^\\d]" <regexp> matches? ] unit-test
  [ t ] [ "a" "[^\\d]" <regexp> matches? ] unit-test
  
-/*
-! FIXME
  [ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" <regexp> matches? ] unit-test
  [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" <regexp> matches? ] unit-test
  [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" <regexp> matches? ] unit-test
-*/
  
  [ t ] [ "1000" "\\d{4,6}" <regexp> matches? ] unit-test
  [ t ] [ "1000" "[0-9]{4,6}" <regexp> matches? ] unit-test
@@ -195,8 +191,8 @@ IN: regexp-tests
  [ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
  [ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
  [ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
-[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
-[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
+[ t ] [ "x" "\\u0078" <regexp> matches? ] unit-test
+[ f ] [ "y" "\\u0078" <regexp> matches? ] unit-test
  
  [ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
  [ f ] [ "b" "a+b" <regexp> matches? ] unit-test
@@ -214,8 +210,8 @@ IN: regexp-tests
  [ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test
  [ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test
  
-[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
-[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
+[ 3 ] [ "aaacb" "a*" <regexp> match-index-head ] unit-test
+[ 2 ] [ "aaacb" "aa?" <regexp> match-index-head ] unit-test
  
  [ t ] [ "aaa" R/ AAA/i matches? ] unit-test
  [ f ] [ "aax" R/ AAA/i matches? ] unit-test
@@ -226,6 +222,7 @@ IN: regexp-tests
  [ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
  [ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
  
+/*
  [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
  [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
  [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
@@ -235,6 +232,7 @@ IN: regexp-tests
  [ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
  [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
  [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
+*/
  
  [ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
  [ t ] [ "A" R/ [a-z]/i matches? ] unit-test
@@ -242,9 +240,11 @@ IN: regexp-tests
  [ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
  [ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
  
-[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
-[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
-! [ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/ matches? ] unit-test ! FIXME
+[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test
+[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test
+
+[ t ] [ "xabc" R/ abc/ <reverse-matcher> match-index-head >boolean ] unit-test
+[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-head >boolean ] unit-test
  
  [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
  [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@@ -253,8 +253,6 @@ IN: regexp-tests
  [ t ] [ "abc*" "[^\\*]*\\*" <regexp> matches? ] unit-test
  [ t ] [ "bca" "[^a]*a" <regexp> matches? ] unit-test
  
-/*
-! FIXME
  [ ] [
      "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
      <regexp> drop
@@ -271,14 +269,13 @@ IN: regexp-tests
  
  [ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test
  
-[ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test
-[ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test
+[ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
+[ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
  
-[ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
-[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
+[ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
+[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
  
-[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> first-match >string ] unit-test
-*/
+[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> match-head >string ] unit-test
  
  ! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
  
@@ -309,31 +306,49 @@ IN: regexp-tests
  [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
  
  /*
-! FIXME
-[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
-[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
+[ f ] [ "ab" "a(?!b)" <regexp> match-head ] unit-test
+[ "a" ] [ "ac" "a(?!b)" <regexp> match-head >string ] unit-test
  [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
  [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
-[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
-[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
-[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test
+[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> match-head >string ] unit-test
+[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> match-head >string ] unit-test
+[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> match-head >string ] unit-test
  
-[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
-[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
+[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-index-head ] unit-test
+[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-index-head ] unit-test
  */
  
  ! Bug in parsing word
  [ t ] [ "a" R' a' matches? ] unit-test
  
-! Convert to lowercase until E
-[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
-[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
-
-! Convert to uppercase until E
-[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
-[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
-
-! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
+! Testing negation
+[ f ] [ "a" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a)/ matches? ] unit-test
+
+[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
+
+! Intersecting classes
+[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
+[ t ] [ "ac" R/ ac|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ac" R/ ac|[a-z]b/ matches? ] unit-test
+[ t ] [ "ac" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "πb" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ f ] [ "πc" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ f ] [ "Ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+
+[ t ] [ "aaaa" R/ .*a./ matches? ] unit-test
+
+! DFA is compiled when needed, or when literal
+[ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test
+[ t ] [ R/ foo/ dfa>> >boolean ] unit-test
  
  ! [ t ] [ "a" R/ ^a/ matches? ] unit-test
  ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
@@ -378,20 +393,20 @@ IN: regexp-tests
  ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
  ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
  
-! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\n" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
  
-! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
-! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
+! [ f ] [ "foobxr" "foo\\z" <regexp> match-index-head ] unit-test
+! [ 3 ] [ "foo" "foo\\z" <regexp> match-index-head ] unit-test
  
  ! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
  ! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
  ! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
  ! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
  
-! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
+! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-index-head ] unit-test
  ! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
  ! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
  ! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
@@ -399,18 +414,18 @@ IN: regexp-tests
  ! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
  
  ! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
-! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-head ] unit-test
+! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-index-head ] unit-test
  ! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
  ! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
  ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
  ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
  
-! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
-! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
+! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
+! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test
  ! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
  ! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
-! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
-! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
+! [ 3 ] [ "aacb" "aa?c" <regexp> match-index-head ] unit-test
+! [ 3 ] [ "aacb" "aa??c" <regexp> match-index-head ] unit-test
  
  ! "ab" "a(?=b*)" <regexp> match
  ! "abbbbbc" "a(?=b*c)" <regexp> match
diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor

index 11d257b6b256d17de69b325ffc177727ac98f541..0502cb4d4b765b9176fa422bea6bbdc2d37d5ce3 100644 (file)
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@@ -1,86 +1,59 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors combinators kernel math sequences strings sets
  assocs prettyprint.backend prettyprint.custom make lexer
-namespaces parser arrays fry regexp.backend regexp.utils
+namespaces parser arrays fry locals regexp.minimize
  regexp.parser regexp.nfa regexp.dfa regexp.traversal
-regexp.transition-tables splitting sorting ;
+regexp.transition-tables splitting sorting regexp.ast
+regexp.negation regexp.matchers regexp.compiler ;
  IN: regexp
  
-: default-regexp ( string -- regexp )
-    regexp new
-        swap >>raw
-        <transition-table> >>nfa-table
-        <transition-table> >>dfa-table
-        <transition-table> >>minimized-table
-        H{ } clone >>nfa-traversal-flags
-        H{ } clone >>dfa-traversal-flags
-        H{ } clone >>options
-        H{ } clone >>matchers
-        reset-regexp ;
-
-: construct-regexp ( regexp -- regexp' )
-    {
-        [ parse-regexp ]
-        [ construct-nfa ]
-        [ construct-dfa ]
-        [ ]
-    } cleave ;
-
-: (match) ( string regexp -- dfa-traverser )
-    <dfa-traverser> do-match ; inline
-
-: match ( string regexp -- slice/f )
-    (match) return-match ;
-
-: match* ( string regexp -- slice/f captured-groups )
-    (match) [ return-match ] [ captured-groups>> ] bi ;
+TUPLE: regexp
+    { raw read-only }
+    { parse-tree read-only }
+    { options read-only }
+    dfa reverse-dfa dfa-quot ;
  
-: matches? ( string regexp -- ? )
-    dupd match
-    [ [ length ] bi@ = ] [ drop f ] if* ;
+: make-regexp ( string ast -- regexp )
+    f f <options> f f f regexp boa ; foldable
+    ! Foldable because, when the dfa slot is set,
+    ! it'll be set to the same thing regardless of who sets it
  
-: match-head ( string regexp -- end/f ) match [ length ] [ f ] if* ;
+: <optioned-regexp> ( string options -- regexp )
+    [ dup parse-regexp ] [ string>options ] bi*
+    f f f regexp boa ;
  
-: match-at ( string m regexp -- n/f finished? )
-    [
-        2dup swap length > [ 2drop f f ] [ tail-slice t ] if
-    ] dip swap [ match-head f ] [ 2drop f t ] if ;
-
-: match-range ( string m regexp -- a/f b/f )
-    3dup match-at over [
-        drop nip rot drop dupd +
-    ] [
-        [ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if
-    ] if ;
+: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
  
-: first-match ( string regexp -- slice/f )
-    dupd 0 swap match-range rot over [ <slice> ] [ 3drop f ] if ;
+TUPLE: reverse-matcher regexp ;
+C: <reverse-matcher> reverse-matcher
  
-: re-cut ( string regexp -- end/f start )
-    dupd first-match
-    [ split1-slice swap ] [ "" like f swap ] if* ;
+<PRIVATE
  
-: (re-split) ( string regexp -- )
-    over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
+: get-ast ( regexp -- ast )
+    [ parse-tree>> ] [ options>> ] bi <with-options> ;
  
-: re-split ( string regexp -- seq )
-    [ (re-split) ] { } make ;
+: compile-regexp ( regexp -- regexp )
+    dup '[ [ _ get-ast ast>dfa ] unless* ] change-dfa ;
  
-: re-replace ( string regexp replacement -- result )
-    [ re-split ] dip join ;
+: compile-dfa-quot ( regexp -- regexp )
+    dup '[ [ _ compile-regexp dfa>> dfa>quotation ] unless* ] change-dfa-quot ;
  
-: next-match ( string regexp -- end/f match/f )
-    dupd first-match dup
-    [ [ split1-slice nip ] keep ] [ 2drop f f ] if ;
+: <reversed-option> ( ast -- reversed )
+    "r" string>options <with-options> ;
  
-: all-matches ( string regexp -- seq )
-    [ dup ] swap '[ _ next-match ] produce nip harvest ;
+: compile-reverse ( regexp -- regexp )
+    dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ;
  
-: count-matches ( string regexp -- n )
-    all-matches length ;
+M: regexp match-index-from ( string regexp -- index/f )
+    dup dfa-quot>>
+    [ <quot-matcher> ]
+    [ compile-regexp dfa>> <dfa-matcher> ] ?if
+    match-index-from ;
  
-<PRIVATE
+M: reverse-matcher match-index-from ( string regexp -- index/f )
+    [ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi*
+    <dfa-traverser> do-match match-index>> ;
  
  : find-regexp-syntax ( string -- prefix suffix )
      {
@@ -97,28 +70,19 @@ IN: regexp
          { "R| "  "|"  }
      } swap [ subseq? not nip ] curry assoc-find drop ;
  
-: string>options ( string -- options )
-    [ ch>option dup ] H{ } map>assoc ;
-
-: options>string ( options -- string )
-    keys [ option>ch ] map natural-sort >string ;
-
-PRIVATE>
-
-: <optioned-regexp> ( string option-string -- regexp )
-    [ default-regexp ] [ string>options ] bi* >>options
-    construct-regexp ;
-
-: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
+: take-until ( end lexer -- string )
+    dup skip-blank [
+        [ index-from ] 2keep
+        [ swapd subseq ]
+        [ 2drop 1+ ] 3bi
+    ] change-lexer-column ;
  
-<PRIVATE
+: parse-noblank-token ( lexer -- str/f )
+    dup still-parsing-line? [ (parse-token) ] [ drop f ] if ;
  
  : parsing-regexp ( accum end -- accum )
-    lexer get dup skip-blank
-    [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
-    lexer get dup still-parsing-line?
-    [ (parse-token) ] [ drop f ] if
-    <optioned-regexp> parsed ;
+    lexer get [ take-until ] [ parse-noblank-token ] bi
+    <optioned-regexp> compile-dfa-quot parsed ;
  
  PRIVATE>
  
diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor

index e5c31a54e0e40f4260e439030410069e36b99bc2..2b0a5c2bcc473f77de2ae11ca9d848639275eaea 100644 (file)
--- a/basis/regexp/transition-tables/transition-tables.factor
+++ b/basis/regexp/transition-tables/transition-tables.factor
@@ -1,32 +1,9 @@
  ! Copyright (C) 2008 Doug Coleman.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors arrays assocs fry hashtables kernel sequences
-vectors regexp.utils ;
+vectors locals ;
  IN: regexp.transition-tables
  
-TUPLE: transition from to obj ;
-TUPLE: literal-transition < transition ;
-TUPLE: class-transition < transition ;
-TUPLE: default-transition < transition ;
-
-TUPLE: literal obj ;
-TUPLE: class obj ;
-TUPLE: default ;
-: make-transition ( from to obj class -- obj )
-    new
-        swap >>obj
-        swap >>to
-        swap >>from ;
-
-: <literal-transition> ( from to obj -- transition )
-    literal-transition make-transition ;
-
-: <class-transition> ( from to obj -- transition )
-    class-transition make-transition ;
-
-: <default-transition> ( from to -- transition )
-    t default-transition make-transition ;
-
  TUPLE: transition-table transitions start-state final-states ;
  
  : <transition-table> ( -- transition-table )
@@ -37,12 +14,11 @@ TUPLE: transition-table transitions start-state final-states ;
  : maybe-initialize-key ( key hashtable -- )
      2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
  
-: set-transition ( transition hash -- )
-    #! set the state as a key
-    2dup [ to>> ] dip maybe-initialize-key
-    [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
-    2dup at* [ 2nip insert-at ]
-    [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ;
+:: set-transition ( from to obj hash -- )
+    to hash maybe-initialize-key
+    from hash at
+    [ [ to obj ] dip push-at ]
+    [ to 1vector obj associate from hash set-at ] if* ;
  
-: add-transition ( transition transition-table -- )
+: add-transition ( from to obj transition-table -- )
      transitions>> set-transition ;
diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor

index 104a6c2ce1c2159445e2ba8175d55520e5e295b1..b890ca7e122e15f635283a9b58d6aaac50b657a3 100644 (file)
--- a/basis/regexp/traversal/traversal.factor
+++ b/basis/regexp/traversal/traversal.factor
@@ -1,178 +1,56 @@
  ! Copyright (C) 2008 Doug Coleman.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors assocs combinators kernel math
-quotations sequences regexp.parser regexp.classes fry arrays
-combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
+quotations sequences regexp.classes fry arrays regexp.matchers
+combinators.short-circuit prettyprint regexp.nfa ;
  IN: regexp.traversal
  
  TUPLE: dfa-traverser
      dfa-table
-    traversal-flags
-    traverse-forward
-    lookahead-counters
-    lookbehind-counters
-    capture-counters
-    captured-groups
-    capture-group-index
-    last-state current-state
+    current-state
      text
-    match-failed?
-    start-index current-index
-    matches ;
+    current-index
+    match-index ;
  
-: <dfa-traverser> ( text regexp -- match )
-    [ dfa-table>> ] [ dfa-traversal-flags>> ] bi
+: <dfa-traverser> ( start-index text dfa -- match )
      dfa-traverser new
-        swap >>traversal-flags
          swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
          swap >>text
-        t >>traverse-forward
-        0 >>start-index
-        0 >>current-index
-        0 >>capture-group-index
-        V{ } clone >>matches
-        V{ } clone >>capture-counters
-        V{ } clone >>lookbehind-counters
-        V{ } clone >>lookahead-counters
-        H{ } clone >>captured-groups ;
+        swap >>current-index ;
  
  : final-state? ( dfa-traverser -- ? )
      [ current-state>> ]
      [ dfa-table>> final-states>> ] bi key? ;
  
-: beginning-of-text? ( dfa-traverser -- ? )
-    current-index>> 0 <= ; inline
-
  : end-of-text? ( dfa-traverser -- ? )
      [ current-index>> ] [ text>> length ] bi >= ; inline
  
  : text-finished? ( dfa-traverser -- ? )
      {
-        [ current-state>> empty? ]
+        [ current-state>> not ]
          [ end-of-text? ]
-        [ match-failed?>> ]
      } 1|| ;
  
-: save-final-state ( dfa-straverser -- )
-    [ current-index>> ] [ matches>> ] bi push ;
+: save-final-state ( dfa-traverser -- dfa-traverser )
+    dup current-index>> >>match-index ;
  
  : match-done? ( dfa-traverser -- ? )
-    dup final-state? [
-        dup save-final-state
-    ] when text-finished? ;
-
-: previous-text-character ( dfa-traverser -- ch )
-    [ text>> ] [ current-index>> 1- ] bi nth ;
-
-: current-text-character ( dfa-traverser -- ch )
-    [ text>> ] [ current-index>> ] bi nth ;
-
-: next-text-character ( dfa-traverser -- ch )
-    [ text>> ] [ current-index>> 1+ ] bi nth ;
-
-GENERIC: flag-action ( dfa-traverser flag -- )
-
-
-M: beginning-of-input flag-action ( dfa-traverser flag -- )
-    drop
-    dup beginning-of-text? [ t >>match-failed? ] unless drop ;
-
-M: end-of-input flag-action ( dfa-traverser flag -- )
-    drop
-    dup end-of-text? [ t >>match-failed? ] unless drop ;
-
-
-M: beginning-of-line flag-action ( dfa-traverser flag -- )
-    drop
-    dup {
-        [ beginning-of-text? ]
-        [ previous-text-character terminator-class class-member? ]
-    } 1|| [ t >>match-failed? ] unless drop ;
-
-M: end-of-line flag-action ( dfa-traverser flag -- )
-    drop
-    dup {
-        [ end-of-text? ]
-        [ next-text-character terminator-class class-member? ]
-    } 1|| [ t >>match-failed? ] unless drop ;
-
-
-M: word-boundary flag-action ( dfa-traverser flag -- )
-    drop
-    dup {
-        [ end-of-text? ]
-        [ current-text-character terminator-class class-member? ]
-    } 1|| [ t >>match-failed? ] unless drop ;
-
-
-M: lookahead-on flag-action ( dfa-traverser flag -- )
-    drop
-    lookahead-counters>> 0 swap push ;
-
-M: lookahead-off flag-action ( dfa-traverser flag -- )
-    drop
-    dup lookahead-counters>>
-    [ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ;
-
-M: lookbehind-on flag-action ( dfa-traverser flag -- )
-    drop
-    f >>traverse-forward
-    [ 2 - ] change-current-index
-    lookbehind-counters>> 0 swap push ;
-
-M: lookbehind-off flag-action ( dfa-traverser flag -- )
-    drop
-    t >>traverse-forward
-    dup lookbehind-counters>>
-    [ drop ] [ pop '[ _ + 2 + ] change-current-index drop ] if-empty ;
-
-M: capture-group-on flag-action ( dfa-traverser flag -- )
-    drop
-    [ current-index>> 0 2array ]
-    [ capture-counters>> ] bi push ;
-
-M: capture-group-off flag-action ( dfa-traverser flag -- )
-    drop
-    dup capture-counters>> empty? [
-        drop
-    ] [
-        {
-            [ capture-counters>> pop first2 dupd + ]
-            [ text>> <slice> ]
-            [ [ 1+ ] change-capture-group-index capture-group-index>> ]
-            [ captured-groups>> set-at ]
-        } cleave
-    ] if ;
-
-: process-flags ( dfa-traverser -- )
-    [ [ 1+ ] map ] change-lookahead-counters
-    [ [ 1+ ] map ] change-lookbehind-counters
-    [ [ first2 1+ 2array ] map ] change-capture-counters
-    ! dup current-state>> .
-    dup [ current-state>> ] [ traversal-flags>> ] bi
-    at [ flag-action ] with each ;
+    dup final-state? [ save-final-state ] when text-finished? ;
  
  : increment-state ( dfa-traverser state -- dfa-traverser )
-    [
-        dup traverse-forward>>
-        [ [ 1+ ] change-current-index ]
-        [ [ 1- ] change-current-index ] if
-        dup current-state>> >>last-state
-    ] [ first ] bi* >>current-state ;
+    >>current-state
+    [ 1 + ] change-current-index ;
  
  : match-literal ( transition from-state table -- to-state/f )
      transitions>> at at ;
  
  : match-class ( transition from-state table -- to-state/f )
      transitions>> at* [
-        [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if
+        swap '[ drop _ swap class-member? ] assoc-find spin ?
      ] [ drop ] if ;
  
-: match-default ( transition from-state table -- to-state/f )
-    [ drop ] 2dip transitions>> at t swap at ;
-
  : match-transition ( obj from-state dfa -- to-state/f )
-    { [ match-literal ] [ match-class ] [ match-default ] } 3|| ;
+    { [ match-literal ] [ match-class ] } 3|| ;
  
  : setup-match ( match -- obj state dfa-table )
      [ [ current-index>> ] [ text>> ] bi nth ]
@@ -180,16 +58,12 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
      [ dfa-table>> ] tri ;
  
  : do-match ( dfa-traverser -- dfa-traverser )
-    dup process-flags
      dup match-done? [
          dup setup-match match-transition
          [ increment-state do-match ] when*
      ] unless ;
  
-: return-match ( dfa-traverser -- slice/f )
-    dup matches>>
-    [ drop f ]
-    [
-        [ [ text>> ] [ start-index>> ] bi ]
-        [ peek ] bi* rot <slice>
-    ] if-empty ;
+TUPLE: dfa-matcher dfa ;
+C: <dfa-matcher> dfa-matcher
+M: dfa-matcher match-index-from
+    dfa>> <dfa-traverser> do-match match-index>> ;
diff --git a/basis/regexp/utils/utils-tests.factor b/basis/regexp/utils/utils-tests.factor

deleted file mode 100644 (file)

index d048ad4..0000000
--- a/basis/regexp/utils/utils-tests.factor
+++ /dev/null
@@ -1,4 +0,0 @@
-USING: regexp.utils tools.test ;
-IN: regexp.utils.tests
-
-[ [ ] [ ] while-changes ] must-infer
diff --git a/basis/regexp/utils/utils.factor b/basis/regexp/utils/utils.factor

deleted file mode 100644 (file)

index af1b2fa..0000000
--- a/basis/regexp/utils/utils.factor
+++ /dev/null
@@ -1,64 +0,0 @@
-! Copyright (C) 2008 Doug Coleman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs io kernel math math.order
-namespaces regexp.backend sequences unicode.categories
-math.ranges fry combinators.short-circuit vectors ;
-IN: regexp.utils
-
-: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
-    [ [ dup slip ] dip pick over call ] dip dupd =
-    [ 3drop ] [ (while-changes) ] if ; inline recursive
-
-: while-changes ( obj quot pred -- obj' )
-    pick over call (while-changes) ; inline
-
-: assoc-with ( param assoc quot -- assoc curry )
-    swapd [ [ -rot ] dip call ] 2curry ; inline
-
-: insert-at ( value key hash -- )
-    2dup at* [
-        2nip push
-    ] [
-        drop
-        [ dup vector? [ 1vector ] unless ] 2dip set-at
-    ] if ;
-
-: ?insert-at ( value key hash/f -- hash )
-    [ H{ } clone ] unless* [ insert-at ] keep ;
-
-ERROR: bad-octal number ;
-ERROR: bad-hex number ;
-: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
-: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
-
-: ascii? ( n -- ? ) 0 HEX: 7f between? ;
-: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
-: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
-
-: hex-digit? ( n -- ? )
-    [
-        [ decimal-digit? ]
-        [ CHAR: a CHAR: f between? ]
-        [ CHAR: A CHAR: F between? ]
-    ] 1|| ;
-
-: control-char? ( n -- ? )
-    [
-        [ 0 HEX: 1f between? ]
-        [ HEX: 7f = ]
-    ] 1|| ;
-
-: punct? ( n -- ? )
-    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
-
-: c-identifier-char? ( ch -- ? )
-    [ [ alpha? ] [ CHAR: _ = ] ] 1|| ;
-
-: java-blank? ( n -- ? )
-    {
-        CHAR: \s CHAR: \t CHAR: \n
-        HEX: b HEX: 7 CHAR: \r
-    } member? ;
-
-: java-printable? ( n -- ? )
-    [ [ alpha? ] [ punct? ] ] 1|| ;
diff --git a/basis/xmode/catalog/catalog.factor b/basis/xmode/catalog/catalog.factor

index 7a935d31a424b6619f9219a5302c30fe29784d92..894ec264abb4ed02eed51f130aea3fcdc5686194 100644 (file)
--- a/basis/xmode/catalog/catalog.factor
+++ b/basis/xmode/catalog/catalog.factor
@@ -1,13 +1,14 @@
  USING: xmode.loader xmode.utilities xmode.rules namespaces
  strings splitting assocs sequences kernel io.files xml memoize
-words globs combinators io.encodings.utf8 sorting accessors xml.data ;
+words globs combinators io.encodings.utf8 sorting accessors xml.data
+xml.traversal xml.syntax ;
  IN: xmode.catalog
  
  TUPLE: mode file file-name-glob first-line-glob ;
  
-<TAGS: parse-mode-tag ( modes tag -- )
+TAGS: parse-mode-tag ( modes tag -- )
  
-TAG: MODE
+TAG: MODE parse-mode-tag
      dup "NAME" attr [
          mode new {
              { "FILE" f (>>file) }
@@ -17,11 +18,9 @@ TAG: MODE
      ] dip
      rot set-at ;
  
-TAGS>
-
  : parse-modes-tag ( tag -- modes )
      H{ } clone [
-        swap child-tags [ parse-mode-tag ] with each
+        swap children-tags [ parse-mode-tag ] with each
      ] keep ;
  
  MEMO: modes ( -- modes )
@@ -97,8 +96,8 @@ ERROR: mutually-recursive-rulesets ruleset ;
      ] if ;
  
  : finalize-mode ( rulesets -- )
-    rule-sets [
-        dup [ nip finalize-rule-set ] assoc-each
+    dup rule-sets [
+        [ nip finalize-rule-set ] assoc-each
      ] with-variable ;
  
  : load-mode ( name -- rule-sets )
diff --git a/basis/xmode/loader/loader.factor b/basis/xmode/loader/loader.factor

index ef1defc4da55f7ce27962fe6833152a8d46878f3..e5d5112a275b45c406d5c4261612b686c887f187 100644 (file)
--- a/basis/xmode/loader/loader.factor
+++ b/basis/xmode/loader/loader.factor
@@ -1,56 +1,54 @@
  USING: xmode.loader.syntax xmode.tokens xmode.rules
  xmode.keyword-map xml.data xml.traversal xml assocs kernel
  combinators sequences math.parser namespaces parser
-xmode.utilities parser-combinators.regexp io.files accessors ;
+xmode.utilities regexp io.files accessors xml.syntax ;
  IN: xmode.loader
  
  ! Based on org.gjt.sp.jedit.XModeHandler
  
  ! RULES and its children
-<TAGS: parse-rule-tag ( rule-set tag -- )
+TAGS: parse-rule-tag ( rule-set tag -- )
  
-TAG: PROPS
+TAG: PROPS parse-rule-tag
      parse-props-tag >>props drop ;
  
-TAG: IMPORT
+TAG: IMPORT parse-rule-tag
      "DELEGATE" attr swap import-rule-set ;
  
-TAG: TERMINATE
+TAG: TERMINATE parse-rule-tag
      "AT_CHAR" attr string>number >>terminate-char drop ;
  
-RULE: SEQ seq-rule
+RULE: SEQ seq-rule parse-rule-tag
      shared-tag-attrs delegate-attr literal-start ;
  
-RULE: SEQ_REGEXP seq-rule
+RULE: SEQ_REGEXP seq-rule parse-rule-tag
      shared-tag-attrs delegate-attr regexp-attr regexp-start ;
  
-RULE: SPAN span-rule
+RULE: SPAN span-rule parse-rule-tag
      shared-tag-attrs delegate-attr match-type-attr span-attrs parse-begin/end-tags init-span-tag ;
  
-RULE: SPAN_REGEXP span-rule
+RULE: SPAN_REGEXP span-rule parse-rule-tag
      shared-tag-attrs delegate-attr match-type-attr span-attrs regexp-attr parse-begin/end-tags init-span-tag ;
  
-RULE: EOL_SPAN eol-span-rule
+RULE: EOL_SPAN eol-span-rule parse-rule-tag
      shared-tag-attrs delegate-attr match-type-attr literal-start init-eol-span-tag ;
  
-RULE: EOL_SPAN_REGEXP eol-span-rule
+RULE: EOL_SPAN_REGEXP eol-span-rule parse-rule-tag
      shared-tag-attrs delegate-attr match-type-attr regexp-attr regexp-start init-eol-span-tag ;
  
-RULE: MARK_FOLLOWING mark-following-rule
+RULE: MARK_FOLLOWING mark-following-rule parse-rule-tag
      shared-tag-attrs match-type-attr literal-start ;
  
-RULE: MARK_PREVIOUS mark-previous-rule
+RULE: MARK_PREVIOUS mark-previous-rule parse-rule-tag
      shared-tag-attrs match-type-attr literal-start ;
  
-TAG: KEYWORDS ( rule-set tag -- key value )
+TAG: KEYWORDS parse-rule-tag
      rule-set get ignore-case?>> <keyword-map>
-    swap child-tags [ over parse-keyword-tag ] each
+    swap children-tags [ over parse-keyword-tag ] each
      swap (>>keywords) ;
  
-TAGS>
-
  : ?<regexp> ( string/f -- regexp/f )
-    dup [ rule-set get ignore-case?>> <regexp> ] when ;
+    dup [ rule-set get ignore-case?>> <?insensitive-regexp> ] when ;
  
  : (parse-rules-tag) ( tag -- rule-set )
      <rule-set> dup rule-set set
@@ -66,7 +64,7 @@ TAGS>
  
  : parse-rules-tag ( tag -- rule-set )
      [
-        [ (parse-rules-tag) ] [ child-tags ] bi
+        [ (parse-rules-tag) ] [ children-tags ] bi
          [ parse-rule-tag ] with each
          rule-set get
      ] with-scope ;
diff --git a/basis/xmode/loader/syntax/syntax.factor b/basis/xmode/loader/syntax/syntax.factor

index 0e7293da976f54d16fe4222a658580a736cbe570..60318e669e7fea9cffb97649a07d07c21a2236d7 100644 (file)
--- a/basis/xmode/loader/syntax/syntax.factor
+++ b/basis/xmode/loader/syntax/syntax.factor
@@ -3,7 +3,7 @@
  USING: accessors xmode.tokens xmode.rules xmode.keyword-map
  xml.data xml.traversal xml assocs kernel combinators sequences
  math.parser namespaces make parser lexer xmode.utilities
-parser-combinators.regexp io.files splitting arrays ;
+regexp io.files splitting arrays xml.syntax xml.syntax.private ;
  IN: xmode.loader.syntax
  
  ! Rule tag parsing utilities
@@ -11,9 +11,10 @@ IN: xmode.loader.syntax
      new swap init-from-tag swap add-rule ; inline
  
  : RULE:
-    scan scan-word
-    parse-definition { } make
-    swap [ (parse-rule-tag) ] 2curry (TAG:) ; parsing
+    scan scan-word scan-word [
+        parse-definition { } make
+        swap [ (parse-rule-tag) ] 2curry
+    ] dip swap define-tag ; parsing
  
  ! Attribute utilities
  : string>boolean ( string -- ? ) "TRUE" = ;
@@ -32,7 +33,7 @@ IN: xmode.loader.syntax
      [ "NAME" attr ] [ "VALUE" attr ] bi ;
  
  : parse-props-tag ( tag -- assoc )
-    child-tags
+    children-tags
      [ parse-prop-tag ] H{ } map>assoc ;
  
  : position-attrs ( tag -- at-line-start? at-whitespace-end? at-word-start? )
@@ -46,7 +47,8 @@ IN: xmode.loader.syntax
      swap position-attrs <matcher> ;
  
  : parse-regexp-matcher ( tag -- matcher )
-    dup children>string rule-set get ignore-case?>> <regexp>
+    dup children>string
+    rule-set get ignore-case?>> <?insensitive-regexp>
      swap position-attrs <matcher> ;
  
  : shared-tag-attrs ( -- )
@@ -79,22 +81,20 @@ IN: xmode.loader.syntax
      [ parse-literal-matcher >>end drop ] , ;
  
  ! SPAN's children
-<TAGS: parse-begin/end-tag ( rule tag -- )
+TAGS: parse-begin/end-tag ( rule tag -- )
  
-TAG: BEGIN
+TAG: BEGIN parse-begin/end-tag
      ! XXX
      parse-literal-matcher >>start drop ;
  
-TAG: END
+TAG: END parse-begin/end-tag
      ! XXX
      parse-literal-matcher >>end drop ;
  
-TAGS>
-
  : parse-begin/end-tags ( -- )
      [
          ! XXX: handle position attrs on span tag itself
-        child-tags [ parse-begin/end-tag ] with each
+        children-tags [ parse-begin/end-tag ] with each
      ] , ;
  
  : init-span-tag ( -- ) [ drop init-span ] , ;
diff --git a/basis/xmode/marker/marker.factor b/basis/xmode/marker/marker.factor

index cff0af2a981ed41c7cff5f2a8e6d8dc8585c5cb3..e106af79526eb2209de8a46fe9cbeb419df22c04 100755 (executable)
--- a/basis/xmode/marker/marker.factor
+++ b/basis/xmode/marker/marker.factor
@@ -4,8 +4,24 @@ IN: xmode.marker
  USING: kernel namespaces make xmode.rules xmode.tokens
  xmode.marker.state xmode.marker.context xmode.utilities
  xmode.catalog sequences math assocs combinators strings
-parser-combinators.regexp splitting parser-combinators ascii
+regexp splitting ascii regexp.backend unicode.case
  ascii combinators.short-circuit accessors ;
+! regexp.backend is for the regexp class
+
+! Next two words copied from parser-combinators
+! Just like head?, but they optionally ignore case
+
+: string= ( str1 str2 ignore-case -- ? )
+    [ [ >upper ] bi@ ] when sequence= ;
+
+: string-head? ( str1 str2 ignore-case -- ? )
+    2over shorter?
+    [ 3drop f ] [
+        [
+            [ nip ]
+            [ length head-slice ] 2bi
+        ] dip string=
+    ] if ;
  
  ! Based on org.gjt.sp.jedit.syntax.TokenMarker
  
@@ -150,7 +166,7 @@ M: escape-rule handle-rule-start
      process-escape? get [
          escaped? [ not ] change
          position [ + ] change
-    ] [ 2drop ] if ;
+    ] [ drop ] if ;
  
  M: seq-rule handle-rule-start
      ?end-rule
diff --git a/basis/xmode/rules/rules.factor b/basis/xmode/rules/rules.factor

index adc43d7bb6b6364521eb220c564af61dfbcd6436..99364fe7cd605808e74a0815b10149c2870db86e 100644 (file)
--- a/basis/xmode/rules/rules.factor
+++ b/basis/xmode/rules/rules.factor
@@ -1,6 +1,6 @@
  USING: accessors xmode.tokens xmode.keyword-map kernel
  sequences vectors assocs strings memoize unicode.case
-parser-combinators.regexp ;
+regexp regexp.backend ; ! regexp.backend has the regexp class
  IN: xmode.rules
  
  TUPLE: string-matcher string ignore-case? ;
diff --git a/basis/xmode/utilities/utilities-tests.factor b/basis/xmode/utilities/utilities-tests.factor

index 538c8cef6b22b7d5fa18f1d1dcbd7c21d1e8f9d2..338878942b0bc45e3bb8feda19a6834292cc0f70 100644 (file)
--- a/basis/xmode/utilities/utilities-tests.factor
+++ b/basis/xmode/utilities/utilities-tests.factor
@@ -1,45 +1,2 @@
+USING: assocs xmode.utilities tools.test ;
  IN: xmode.utilities.tests
-USING: accessors xmode.utilities tools.test xml xml.data kernel
-strings vectors sequences io.files prettyprint assocs
-unicode.case ;
-
-TUPLE: company employees type ;
-
-: <company> V{ } clone f company boa ;
-
-: add-employee employees>> push ;
-
-<TAGS: parse-employee-tag
-
-TUPLE: employee name description ;
-
-TAG: employee
-    employee new
-    { { "name" f (>>name) } { f (>>description) } }
-    init-from-tag swap add-employee ;
-
-TAGS>
-
-\ parse-employee-tag see
-
-: parse-company-tag
-    [
-        <company>
-        { { "type" >upper (>>type) } }
-        init-from-tag dup
-    ] keep
-    children>> [ tag? ] filter
-    [ parse-employee-tag ] with each ;
-
-[
-    T{ company f
-        V{
-            T{ employee f "Joe" "VP Sales" }
-            T{ employee f "Jane" "CFO" }
-        }
-        "PUBLIC"
-    }
-] [
-    "vocab:xmode/utilities/test.xml"
-    file>xml parse-company-tag
-] unit-test
diff --git a/basis/xmode/utilities/utilities.factor b/basis/xmode/utilities/utilities.factor

index f3e28bd4dab14d953a9f22b00b0e7b7ba3c83611..a7e42877aa2db5ed769d8e63a9343678755e90ff 100644 (file)
--- a/basis/xmode/utilities/utilities.factor
+++ b/basis/xmode/utilities/utilities.factor
@@ -1,11 +1,10 @@
  USING: accessors sequences assocs kernel quotations namespaces
-xml.data xml.traversal combinators macros parser lexer words fry ;
+xml.data xml.traversal combinators macros parser lexer words fry
+regexp ;
  IN: xmode.utilities
  
  : implies ( x y -- z ) [ not ] dip or ; inline
  
-: child-tags ( tag -- seq ) children>> [ tag? ] filter ;
-
  : tag-init-form ( spec -- quot )
      {
          { [ dup quotation? ] [ [ object get tag get ] prepose ] }
@@ -33,20 +32,5 @@ MACRO: (init-from-tag) ( specs -- )
  : init-from-tag ( tag tuple specs -- tuple )
      over [ (init-from-tag) ] dip ; inline
  
-SYMBOL: tag-handlers
-SYMBOL: tag-handler-word
-
-: <TAGS:
-    CREATE tag-handler-word set
-    H{ } clone tag-handlers set ; parsing
-
-: (TAG:) ( name quot -- ) swap tag-handlers get set-at ;
-
-: TAG:
-    scan parse-definition
-    (TAG:) ; parsing
-
-: TAGS>
-    tag-handler-word get
-    tag-handlers get >alist [ [ dup main>> ] dip case ] curry
-    define ; parsing
+: <?insensitive-regexp> ( string ? -- regexp )
+    "i" "" ? <optioned-regexp> ;
diff --git a/core/assocs/assocs.factor b/core/assocs/assocs.factor

index fdaa02e6c42cd4e8815cdc3181271f0de49f63d3..35e7535aa71f33007d651bbc0fbc06feac6fdd27 100755 (executable)
--- a/core/assocs/assocs.factor
+++ b/core/assocs/assocs.factor
@@ -41,7 +41,7 @@ M: assoc assoc-like drop ;
  : substituter ( assoc -- quot )
      [ ?at drop ] curry ; inline
  
-: with-assoc ( assoc quot: ( value key -- assoc ) -- quot: ( key value -- ) )
+: with-assoc ( assoc quot: ( value key assoc -- ) -- quot: ( key value -- ) )
      curry [ swap ] prepose ; inline
  
  PRIVATE>
diff --git a/extra/benchmark/regex-dna/regex-dna.factor b/extra/benchmark/regex-dna/regex-dna.factor

index 8c0aee596de53c0427b17abfc72ec597ff72ed02..29cb0b73578eceacdf2199c9b9d8c72334b6bdd2 100644 (file)
--- a/extra/benchmark/regex-dna/regex-dna.factor
+++ b/extra/benchmark/regex-dna/regex-dna.factor
@@ -1,7 +1,7 @@
  ! Copyright (C) 2008 Slava Pestov.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors regexp prettyprint io io.encodings.ascii
-io.files kernel sequences assocs namespaces ;
+USING: accessors regexp.matchers prettyprint io io.encodings.ascii
+io.files kernel sequences assocs namespaces regexp ;
  IN: benchmark.regex-dna
  
  ! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1
author	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Tue, 3 Mar 2009 18:24:59 +0000 (12:24 -0600)
committer	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Tue, 3 Mar 2009 18:24:59 +0000 (12:24 -0600)
basis/ascii/ascii.factor		patch \| blob \| history
basis/globs/globs-tests.factor		patch \| blob \| history
basis/globs/globs.factor		patch \| blob \| history
basis/regexp/ast/ast.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/backend/backend.factor	[deleted file]	patch \| blob \| history
basis/regexp/classes/classes-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/classes/classes.factor		patch \| blob \| history
basis/regexp/combinators/authors.txt	[new file with mode: 0644]	patch \| blob
basis/regexp/combinators/combinators-docs.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/combinators/combinators-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/combinators/combinators.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/combinators/summary.txt	[new file with mode: 0644]	patch \| blob
basis/regexp/combinators/tags.txt	[new file with mode: 0644]	patch \| blob
basis/regexp/compiler/compiler.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/dfa/dfa-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/dfa/dfa.factor		patch \| blob \| history
basis/regexp/disambiguate/disambiguate.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/matchers/matchers.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/minimize/minimize-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/minimize/minimize.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/negation/negation-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/negation/negation.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/nfa/nfa.factor		patch \| blob \| history
basis/regexp/parser/parser-tests.factor		patch \| blob \| history
basis/regexp/parser/parser.factor		patch \| blob \| history
basis/regexp/regexp-docs.factor		patch \| blob \| history
basis/regexp/regexp-tests.factor		patch \| blob \| history
basis/regexp/regexp.factor		patch \| blob \| history
basis/regexp/transition-tables/transition-tables.factor		patch \| blob \| history
basis/regexp/traversal/traversal.factor		patch \| blob \| history
basis/regexp/utils/utils-tests.factor	[deleted file]	patch \| blob \| history
basis/regexp/utils/utils.factor	[deleted file]	patch \| blob \| history
basis/xmode/catalog/catalog.factor		patch \| blob \| history
basis/xmode/loader/loader.factor		patch \| blob \| history
basis/xmode/loader/syntax/syntax.factor		patch \| blob \| history
basis/xmode/marker/marker.factor		patch \| blob \| history
basis/xmode/rules/rules.factor		patch \| blob \| history
basis/xmode/utilities/utilities-tests.factor		patch \| blob \| history
basis/xmode/utilities/utilities.factor		patch \| blob \| history
core/assocs/assocs.factor		patch \| blob \| history
extra/benchmark/regex-dna/regex-dna.factor		patch \| blob \| history