Various regexp changes, including the addition of regexp combinators

author Daniel Ehrenberg <littledan@Macintosh-122.local>

Wed, 25 Feb 2009 18:22:12 +0000 (12:22 -0600)

committer Daniel Ehrenberg <littledan@Macintosh-122.local>

Wed, 25 Feb 2009 18:22:12 +0000 (12:22 -0600)
author Daniel Ehrenberg <littledan@Macintosh-122.local>
Wed, 25 Feb 2009 18:22:12 +0000 (12:22 -0600)
committer Daniel Ehrenberg <littledan@Macintosh-122.local>
Wed, 25 Feb 2009 18:22:12 +0000 (12:22 -0600)
diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor

index 65748005f485fcace802a547c83039f6c37f2307..b804eacc090d65790d2df8a979b5ae2449393f52 100644 (file)
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@@ -13,7 +13,10 @@ C: <from-to> from-to
  TUPLE: at-least n ;
  C: <at-least> at-least
  
-SINGLETON: epsilon
+TUPLE: tagged-epsilon tag ;
+C: <tagged-epsilon> tagged-epsilon
+
+CONSTANT: epsilon T{ tagged-epsilon }
  
  TUPLE: concatenation first second ;
  
@@ -60,3 +63,10 @@ C: <lookahead> lookahead
  
  TUPLE: lookbehind term ;
  C: <lookbehind> lookbehind
+
+TUPLE: possessive-star term ;
+C: <possessive-star> possessive-star
+
+: <possessive-plus> ( term -- term' )
+    dup <possessive-star> 2array <concatenation> ;
+
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor

index 6e68e9e0f6da66449244914352b0e6ed753859ef..0990ac786b5f3c3542949fb1133a817e98c23749 100644 (file)
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -12,8 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class
  control-character-class hex-digit-class java-blank-class c-identifier-class
  unmatchable-class terminator-class word-boundary-class ;
  
-SINGLETONS: beginning-of-input beginning-of-line
-end-of-input end-of-line ;
+SINGLETONS: beginning-of-input ^ end-of-input $ ;
  
  TUPLE: range from to ;
  C: <range> range
@@ -100,10 +99,10 @@ M: unmatchable-class class-member? ( obj class -- ? )
  M: terminator-class class-member? ( obj class -- ? )
      drop "\r\n\u000085\u002029\u002028" member? ;
  
-M: beginning-of-line class-member? ( obj class -- ? )
+M: ^ class-member? ( obj class -- ? )
      2drop f ;
  
-M: end-of-line class-member? ( obj class -- ? )
+M: $ class-member? ( obj class -- ? )
      2drop f ;
  
  M: f class-member? 2drop f ;
diff --git a/basis/regexp/combinators/combinators-tests.factor b/basis/regexp/combinators/combinators-tests.factor

new file mode 100644 (file)

index 0000000..dc6b5a6
--- /dev/null
+++ b/basis/regexp/combinators/combinators-tests.factor
@@ -0,0 +1,29 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.combinators tools.test regexp kernel sequences ;
+IN: regexp.combinators.tests
+
+: strings ( -- regexp )
+    { "foo" "bar" "baz" } <any-of> ;
+
+[ t t t ] [ "foo" "bar" "baz" [ strings matches? ] tri@ ] unit-test
+[ f f f ] [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test
+
+: conj ( -- regexp )
+    { R/ .*a/ R/ b.*/ } <and> ;
+
+[ t ] [ "bljhasflsda" conj matches? ] unit-test
+[ f ] [ "bsdfdfs" conj matches? ] unit-test ! why does this fail?
+[ f ] [ "fsfa" conj matches? ] unit-test
+
+! For some reason, creating this DFA doesn't work
+! [ f ] [ "bljhasflsda" conj <not> matches? ] unit-test
+! [ t ] [ "bsdfdfs" conj <not> matches? ] unit-test
+! [ t ] [ "fsfa" conj <not> matches? ] unit-test
+
+[ f f ] [ "" "hi" [ <nothing> matches? ] bi@ ] unit-test
+[ t t ] [ "" "hi" [ <nothing> <not> matches? ] bi@ ] unit-test
+
+[ { t t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <zero-or-more> matches? ] map ] unit-test
+[ { f t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <one-or-more> matches? ] map ] unit-test
+[ { t t f f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <option> matches? ] map ] unit-test
diff --git a/basis/regexp/combinators/combinators.factor b/basis/regexp/combinators/combinators.factor

new file mode 100644 (file)

index 0000000..e6b35c5
--- /dev/null
+++ b/basis/regexp/combinators/combinators.factor
@@ -0,0 +1,48 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp sequences kernel regexp.negation regexp.ast
+accessors fry ;
+IN: regexp.combinators
+
+: <nothing> ( -- regexp )
+    R/ (?~.*)/ ;
+
+: <literal> ( string -- regexp )
+    [ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ;
+
+: <or> ( regexps -- disjunction )
+    [ [ raw>> "(" ")" surround ] map "|" join ]
+    [ [ parse-tree>> ] map <alternation> ] bi
+    make-regexp ;
+
+: <any-of> ( strings -- regexp )
+    [ <literal> ] map <or> ;
+
+: <sequence> ( regexps -- regexp )
+    [ [ raw>> ] map concat ]
+    [ [ parse-tree>> ] map <concatenation> ] bi
+    make-regexp ;
+
+: modify-regexp ( regexp raw-quot tree-quot -- new-regexp )
+    [ '[ raw>> @ ] ]
+    [ '[ parse-tree>> @ ] ] bi* bi
+    make-regexp ; inline
+
+: <not> ( regexp -- not-regexp )
+    [ "(?~" ")" surround ]
+    [ <negation> ] modify-regexp ;
+
+: <and> ( regexps -- conjunction )
+    [ <not> ] map <or> <not> ;
+
+: <zero-or-more> ( regexp -- regexp* )
+    [ "(" ")*" surround ]
+    [ <star> ] modify-regexp ;
+
+: <one-or-more> ( regexp -- regexp+ )
+    [ "(" ")+" surround ]
+    [ <plus> ] modify-regexp ;
+
+: <option> ( regexp -- regexp? )
+    [ "(" ")?" surround ]
+    [ <maybe> ] modify-regexp ;
diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor

index 8c2e99516381f1f108546fb24dbd611ab4e32759..acf59b06374ece4025e72b9f7aa274657179ee07 100644 (file)
--- a/basis/regexp/dfa/dfa.factor
+++ b/basis/regexp/dfa/dfa.factor
@@ -2,7 +2,7 @@
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors arrays assocs combinators fry kernel locals
  math math.order regexp.nfa regexp.transition-tables sequences
-sets sorting vectors ;
+sets sorting vectors regexp.ast ;
  IN: regexp.dfa
  
  :: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj )
@@ -20,7 +20,7 @@ IN: regexp.dfa
      transitions>> '[ _ swap _ at at ] gather sift ;
  
  : (find-epsilon-closure) ( states nfa -- new-states )
-    eps swap find-delta ;
+    epsilon swap find-delta ;
  
  : find-epsilon-closure ( states nfa -- new-states )
      '[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
@@ -35,7 +35,7 @@ IN: regexp.dfa
  : find-transitions ( dfa-state nfa -- next-dfa-state )
      transitions>>
      '[ _ at keys ] gather
-    eps swap remove ;
+    epsilon swap remove ;
  
  : add-todo-state ( state visited-states new-states -- )
      3dup drop key? [ 3drop ] [
diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor

index f5a43a2a5e9aebec91671ec4ca05c52cbb8a10e2..67e77ac7ca46d62fb7a8736cd0e235f3dc426ab1 100644 (file)
--- a/basis/regexp/negation/negation.factor
+++ b/basis/regexp/negation/negation.factor
@@ -2,7 +2,7 @@
  ! See http://factorcode.org/license.txt for BSD license.
  USING: regexp.nfa regexp.disambiguate kernel sequences
  assocs regexp.classes hashtables accessors fry vectors
-regexp.ast regexp.transition-tables regexp.minimize ;
+regexp.ast regexp.transition-tables regexp.minimize namespaces ;
  IN: regexp.negation
  
  : ast>dfa ( parse-tree -- minimal-dfa )
@@ -48,14 +48,14 @@ CONSTANT: fail-state -1
  
  : unify-final-state ( transition-table -- transition-table )
      dup [ final-states>> keys ] keep
-    '[ -2 eps <literal-transition> _ add-transition ] each
+    '[ -2 epsilon <literal-transition> _ add-transition ] each
      H{ { -2 -2 } } >>final-states ;
  
  : adjoin-dfa ( transition-table -- start end )
      box-transitions unify-final-state renumber-states
      [ start-state>> ]
      [ final-states>> keys first ]
-    [ table [ transitions>> ] bi@ swap update ] tri ;
+    [ nfa-table get [ transitions>> ] bi@ swap update ] tri ;
  
  M: negation nfa-node ( node -- start end )
      term>> ast>dfa negate-table adjoin-dfa ;
diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor

index 72ce880f8bcc31e8458a82fe71a9ac98f11237e8..636268116806c4221dd66834c68061ec26bc79e4 100644 (file)
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@@ -1,4 +1,4 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors arrays assocs grouping kernel
  locals math namespaces sequences fry quotations
@@ -24,8 +24,6 @@ M: alternation remove-lookahead
  
  M: concatenation remove-lookahead ;
  
-SINGLETON: eps
-
  SYMBOL: option-stack
  
  SYMBOL: state
@@ -34,7 +32,6 @@ SYMBOL: state
      state [ get ] [ inc ] bi ;
  
  SYMBOL: nfa-table
-: table ( -- table ) nfa-table get ;
  
  : set-each ( keys value hashtable -- )
      '[ _ swap _ set-at ] each ;
@@ -56,10 +53,10 @@ GENERIC: nfa-node ( node -- start-state end-state )
  
  : add-simple-entry ( obj class -- start-state end-state )
      [ next-state next-state 2dup ] 2dip
-    make-transition table add-transition ;
+    make-transition nfa-table get add-transition ;
  
  : epsilon-transition ( source target -- )
-    eps <literal-transition> table add-transition ;
+    epsilon <literal-transition> nfa-table get add-transition ;
  
  M:: star nfa-node ( node -- start end )
      node term>> nfa-node :> s1 :> s0
@@ -71,8 +68,8 @@ M:: star nfa-node ( node -- start end )
      s1 s3 epsilon-transition
      s2 s3 ;
  
-M: epsilon nfa-node
-    drop eps literal-transition add-simple-entry ;
+M: tagged-epsilon nfa-node
+    literal-transition add-simple-entry ;
  
  M: concatenation nfa-node ( node -- start end )
      [ first>> ] [ second>> ] bi
@@ -154,7 +151,7 @@ M: with-options nfa-node ( node -- start end )
          0 state set
          <transition-table> nfa-table set
          remove-lookahead nfa-node
-        table
+        nfa-table get
              swap dup associate >>final-states
              swap >>start-state
      ] with-scope ;
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor

index 56c6b1eb04cd53bc096094ec2f124cebdb44b519..ed0762cc3ab39bb61b3f46c870345f1178ecc39d 100644 (file)
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -6,7 +6,7 @@ regexp.ast ;
  IN: regexp.parser
  
  : allowed-char? ( ch -- ? )
-    ".()|[*+?" member? not ;
+    ".()|[*+?$^" member? not ;
  
  ERROR: bad-number ;
  
@@ -53,6 +53,8 @@ ERROR: bad-class name ;
          { CHAR: d [ digit-class <primitive-class> ] }
          { CHAR: D [ digit-class <primitive-class> <not-class> ] }
  
+        { CHAR: z [ end-of-input <tagged-epsilon> ] }
+        { CHAR: A [ beginning-of-input <tagged-epsilon> ] }
          [ ]
      } case ;
  
@@ -109,7 +111,10 @@ Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-cl
  
  EscapeSequence = "\\" Escape:e => [[ e ]]
  
-Character = EscapeSequence | . ?[ allowed-char? ]?
+Character = EscapeSequence
+          | "$" => [[ $ <tagged-epsilon> ]]
+          | "^" => [[ ^ <tagged-epsilon> ]]
+          | . ?[ allowed-char? ]?
  
  AnyRangeCharacter = EscapeSequence | .
  
@@ -152,6 +157,8 @@ Times = "," Number:n "}" => [[ 0 n <from-to> ]]
        | Number:n "," Number:m "}" => [[ n m <from-to> ]]
  
  Repeated = Element:e "{" Times:t => [[ e t <times> ]]
+         | Element:e "*+" => [[ e <possessive-star> ]]
+         | Element:e "++" => [[ e <possessive-plus> ]]
           | Element:e "?" => [[ e <maybe> ]]
           | Element:e "*" => [[ e <star> ]]
           | Element:e "+" => [[ e <plus> ]]
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor

index 0d9ed129c8546b6427bd8b875b8e392ce3ab25ba..54bc305b4f36a5a87527f385de4d08ed9e2896af 100644 (file)
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -1,5 +1,5 @@
  USING: regexp tools.test kernel sequences regexp.parser
-regexp.traversal eval strings multiline ;
+regexp.traversal eval strings multiline accessors ;
  IN: regexp-tests
  
  \ <regexp> must-infer
@@ -332,6 +332,16 @@ IN: regexp-tests
  ! Intersecting classes
  [ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
  [ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
+[ t ] [ "ac" R/ ac|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ac" R/ ac|[a-z]b/ matches? ] unit-test
+[ t ] [ "ac" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "πb" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ f ] [ "πc" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ f ] [ "Ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+
+[ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test
+[ t ] [ R/ foo/ dfa>> >boolean ] unit-test
  
  ! [ t ] [ "a" R/ ^a/ matches? ] unit-test
  ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor

index 189d430d85950a3adbee180733ce5de3996b5a4d..55a9800254fd9d0b8a8303866f0fcf916cfa9222 100644 (file)
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@@ -1,4 +1,4 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors combinators kernel math sequences strings sets
  assocs prettyprint.backend prettyprint.custom make lexer
@@ -10,17 +10,28 @@ IN: regexp
  
  TUPLE: regexp raw parse-tree options dfa ;
  
+: make-regexp ( string ast -- regexp )
+    f f <options> f regexp boa ;
+
  : <optioned-regexp> ( string options -- regexp )
      [ dup parse-regexp ] [ string>options ] bi*
-    2dup <with-options> ast>dfa
-    regexp boa ;
+    f regexp boa ;
  
  : <regexp> ( string -- regexp ) "" <optioned-regexp> ;
  
  <PRIVATE
  
+: get-dfa ( regexp -- dfa )
+    dup dfa>> [ ] [
+        dup 
+        [ parse-tree>> ]
+        [ options>> ] bi
+        <with-options> ast>dfa
+        [ >>dfa drop ] keep
+    ] ?if ;
+
  : (match) ( string regexp -- dfa-traverser )
-    dfa>> <dfa-traverser> do-match ; inline
+    get-dfa <dfa-traverser> do-match ; inline
  
  PRIVATE>
  
@@ -97,7 +108,7 @@ PRIVATE>
      [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
      lexer get dup still-parsing-line?
      [ (parse-token) ] [ drop f ] if
-    <optioned-regexp> parsed ;
+    <optioned-regexp> dup get-dfa drop parsed ;
  
  PRIVATE>
author	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Wed, 25 Feb 2009 18:22:12 +0000 (12:22 -0600)
committer	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Wed, 25 Feb 2009 18:22:12 +0000 (12:22 -0600)
basis/regexp/ast/ast.factor		patch \| blob \| history
basis/regexp/classes/classes.factor		patch \| blob \| history
basis/regexp/combinators/combinators-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/combinators/combinators.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/dfa/dfa.factor		patch \| blob \| history
basis/regexp/negation/negation.factor		patch \| blob \| history
basis/regexp/nfa/nfa.factor		patch \| blob \| history
basis/regexp/parser/parser.factor		patch \| blob \| history
basis/regexp/regexp-tests.factor		patch \| blob \| history
basis/regexp/regexp.factor		patch \| blob \| history