Regexp negation (partial) and cleanup of regexp.nfa

author Daniel Ehrenberg <littledan@Macintosh-103.local>

Thu, 19 Feb 2009 22:48:46 +0000 (16:48 -0600)

committer Daniel Ehrenberg <littledan@Macintosh-103.local>

Thu, 19 Feb 2009 22:48:46 +0000 (16:48 -0600)
author Daniel Ehrenberg <littledan@Macintosh-103.local>
Thu, 19 Feb 2009 22:48:46 +0000 (16:48 -0600)
committer Daniel Ehrenberg <littledan@Macintosh-103.local>
Thu, 19 Feb 2009 22:48:46 +0000 (16:48 -0600)
diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor

index d018fa3a3690e85d94c7b2168dc0032071c62436..ad67d76d12501005fc4617d30336997f69f353c3 100644 (file)
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@@ -16,11 +16,17 @@ C: <from-to> from-to
  TUPLE: at-least n ;
  C: <at-least> at-least
  
-TUPLE: concatenation seq ;
-C: <concatenation> concatenation
+SINGLETON: epsilon
  
-TUPLE: alternation seq ;
-C: <alternation> alternation
+TUPLE: concatenation first second ;
+
+: <concatenation> ( seq -- concatenation )
+    epsilon [ concatenation boa ] reduce ;
+
+TUPLE: alternation first second ;
+
+: <alternation> ( seq -- alternation )
+    unclip [ alternation boa ] reduce ;
  
  TUPLE: star term ;
  C: <star> star
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor

index 7109e8bcbdb2df994791dd9f70692ad2539fa187..44f33f9fcf1271506616e2e4a404cd2d3e968cd3 100644 (file)
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -1,4 +1,4 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors kernel math math.order words
  ascii unicode.categories combinators.short-circuit sequences ;
@@ -41,9 +41,10 @@ C: <range> range
  
  GENERIC: class-member? ( obj class -- ? )
  
+! When does t get put in?
  M: t class-member? ( obj class -- ? ) 2drop f ;
  
-M: integer class-member? ( obj class -- ? ) 2drop f ;
+M: integer class-member? ( obj class -- ? ) = ;
  
  M: range class-member? ( obj class -- ? )
      [ from>> ] [ to>> ] bi between? ;
@@ -111,3 +112,15 @@ M: beginning-of-line class-member? ( obj class -- ? )
  
  M: end-of-line class-member? ( obj class -- ? )
      2drop f ;
+
+TUPLE: or-class seq ;
+C: <or-class> or-class
+
+TUPLE: not-class class ;
+C: <not-class> not-class
+
+M: or-class class-member?
+    seq>> [ class-member? ] with any? ;
+
+M: not-class class-member?
+    class>> class-member? not ;
diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor

index 52a852af5001dee3006cdb2bab86e2b0ca5e3653..163e87f2b4a52be8ddc63987f3c3073cff4d2f24 100644 (file)
--- a/basis/regexp/minimize/minimize.factor
+++ b/basis/regexp/minimize/minimize.factor
@@ -1,20 +1,48 @@
  ! Copyright (C) 2009 Daniel Ehrenberg
  ! See http://factorcode.org/license.txt for BSD license.
  USING: kernel sequences regexp.transition-tables fry assocs
-accessors locals math sorting arrays sets hashtables regexp.dfa  ;
+accessors locals math sorting arrays sets hashtables regexp.dfa
+combinators.short-circuit ;
  IN: regexp.minimize
  
+: number-transitions ( transitions numbering -- new-transitions )
+    dup '[
+        [ _ at ]
+        [ [ first _ at ] assoc-map ] bi*
+    ] assoc-map ;
+
+: table>state-numbers ( table -- assoc )
+    transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
+
+: map-set ( assoc quot -- new-assoc )
+    '[ drop @ dup ] assoc-map ; inline
+
+: rewrite-transitions ( transition-table assoc quot -- transition-table )
+    [
+        [ '[ _ at ] change-start-state ]
+        [ '[ [ _ at ] map-set ] change-final-states ]
+        [ ] tri
+    ] dip '[ _ @ ] change-transitions ; inline
+
+: number-states ( table -- newtable )
+    dup table>state-numbers
+    [ number-transitions ] rewrite-transitions ;
+
+: initially-same? ( s1 s2 transition-table -- ? )
+    {
+        [ drop <= ]
+        [ transitions>> '[ _ at keys ] bi@ set= ]
+        [ final-states>> '[ _ key? ] bi@ = ]
+    } 3&& ;
+
  :: initialize-partitions ( transition-table -- partitions )
      ! Partition table is sorted-array => ?
      H{ } clone :> out
      transition-table transitions>> keys :> states
      states [| s1 |
          states [| s2 |
-            s1 s2 <= [
-                s1 s2 [ transition-table transitions>> at keys ] bi@ set=
-                s1 s2 [ transition-table final-states>> key? ] bi@ = and
-                [ t s1 s2 2array out set-at ] when
-            ] when
+            s1 s2 transition-table initially-same?
+            [ s1 s2 2array out conjoin ] when
          ] each
      ] each out ;
  
@@ -29,7 +57,6 @@ IN: regexp.minimize
      '[ _ same-partition? ] assoc-all? ;
  
  : partition-more ( partitions transition-table -- partitions )
-    ! This is horribly slow!
      over '[ drop first2 _ _ stay-same? ] assoc-filter ;
  
  : partition>classes ( partitions -- synonyms ) ! old-state => new-state
@@ -40,7 +67,7 @@ IN: regexp.minimize
  
  : state-classes ( transition-table -- synonyms )
      [ initialize-partitions ] keep
-    '[ _ partition-more ] [ ] while-changes
+    '[ _ partition-more ] [ assoc-size ] while-changes
      partition>classes ;
  
  : canonical-state? ( state state-classes -- ? )
@@ -52,33 +79,12 @@ IN: regexp.minimize
  : rewrite-duplicates ( new-transitions state-classes -- new-transitions )
      '[ [ _ at ] assoc-map ] assoc-map ;
  
-: map-set ( assoc quot -- new-assoc )
-    '[ drop @ dup ] assoc-map ; inline
+: combine-transitions ( transitions state-classes -- new-transitions )
+    [ delete-duplicates ] [ rewrite-duplicates ] bi ;
  
  : combine-states ( table -- smaller-table )
      dup state-classes
-    [
-        '[
-            _ [ delete-duplicates ]
-            [ rewrite-duplicates ] bi
-        ] change-transitions
-    ]
-    [ '[ [ _ at ] map-set ] change-final-states ]
-    [ '[ _ at ] change-start-state ]
-    tri ;
-
-: number-transitions ( transitions numbering -- new-transitions )
-    [
-        [ at ]
-        [ '[ first _ at ] assoc-map ]
-        bi-curry bi*
-    ] curry assoc-map ;
-
-: number-states ( table -- newtable )
-    dup transitions>> keys <enum> [ swap ] H{ } assoc-map-as
-    [ '[ _ at ] change-start-state ]
-    [ '[ [ _ at ] map-set ] change-final-states ]
-    [ '[ _ number-transitions ] change-transitions ] tri ;
+    [ combine-transitions ] rewrite-transitions ;
  
  : minimize ( table -- minimal-table )
      clone number-states combine-states ;
diff --git a/basis/regexp/negation/negation-tests.factor b/basis/regexp/negation/negation-tests.factor

new file mode 100644 (file)

index 0000000..2dbca2e
--- /dev/null
+++ b/basis/regexp/negation/negation-tests.factor
@@ -0,0 +1,27 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
+IN: regexp.negation.tests
+
+[
+    ! R/ |[^a]|.+/
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } { T{ not-class f T{ or-class f { CHAR: a } } } -1 } } }
+            { 1 H{ { T{ not-class f T{ or-class f { } } } -1 } } }
+            { -1 H{ { any-char -1 } } }
+        } } 
+        { start-state 0 }
+        { final-states H{ { 0 0 } { -1 -1 } } }
+    }
+] [
+    ! R/ a/
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } } }
+            { 1 H{ } } 
+        } }
+        { start-state 0 }
+        { final-states H{ { 1 1 } } }
+    } negate-table
+] unit-test
diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor

new file mode 100644 (file)

index 0000000..5a9f772
--- /dev/null
+++ b/basis/regexp/negation/negation.factor
@@ -0,0 +1,36 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences
+assocs regexp.classes hashtables accessors ;
+IN: regexp.negation
+
+: ast>dfa ( parse-tree -- minimal-dfa )
+    construct-nfa construct-dfa minimize ;
+
+CONSTANT: fail-state -1
+
+: add-default-transition ( state's-transitions -- new-state's-transitions )
+    clone dup
+    [ [ fail-state ] dip keys <or-class> <not-class> ] keep set-at ;
+
+: fail-state-recurses ( transitions -- new-transitions )
+    clone dup
+    [ fail-state any-char associate fail-state ] dip set-at ;
+
+: add-fail-state ( transitions -- new-transitions )
+    [ add-default-transition ] assoc-map
+    fail-state-recurses ;
+
+: assoc>set ( assoc -- keys-set )
+    [ drop dup ] assoc-map ;
+
+: inverse-final-states ( transition-table -- final-states )
+    [ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
+
+: negate-table ( transition-table -- transition-table )
+    clone
+        [ add-fail-state ] change-transitions
+        dup inverse-final-states >>final-states ;
+
+! M: negation nfa-node ( node -- )
+!     ast>dfa negate-table adjoin-dfa ;
diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor

index 4ad5e0314d701b62fbdabf9f1658296c65fda340..c759ffdf98d7be84730fd9c0ce97ae9a7bf1d3ae 100644 (file)
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@@ -3,15 +3,13 @@
  USING: accessors arrays assocs grouping kernel
  locals math namespaces sequences fry quotations
  math.order math.ranges vectors unicode.categories
-regexp.transition-tables words sets 
+regexp.transition-tables words sets hashtables
  unicode.case.private regexp.ast regexp.classes ;
  ! This uses unicode.case.private for ch>upper and ch>lower
  ! but case-insensitive matching should be done by case-folding everything
  ! before processing starts
  IN: regexp.nfa
  
-ERROR: feature-is-broken feature ;
-
  SYMBOL: negated?
  
  : negate ( -- )
@@ -21,14 +19,13 @@ SINGLETON: eps
  
  SYMBOL: option-stack
  
-SYMBOL: combine-stack
-
  SYMBOL: state
  
  : next-state ( -- state )
      state [ get ] [ inc ] bi ;
  
  SYMBOL: nfa-table
+: table ( -- table ) nfa-table get ;
  
  : set-each ( keys value hashtable -- )
      '[ _ swap _ set-at ] each ;
@@ -46,84 +43,56 @@ SYMBOL: nfa-table
  : option? ( obj -- ? )
      option-stack get assoc-stack ;
  
-: set-start-state ( -- nfa-table )
-    nfa-table get
-        combine-stack get pop first >>start-state ;
-
-GENERIC: nfa-node ( node -- )
+GENERIC: nfa-node ( node -- start-state end-state )
  
-:: add-simple-entry ( obj class -- )
-    [let* | s0 [ next-state ]
-            s1 [ next-state ]
-            stack [ combine-stack get ]
-            table [ nfa-table get ] |
-        negated? get [
-            s0 f obj class make-transition table add-transition
-            s0 s1 <default-transition> table add-transition
-        ] [
-            s0 s1 obj class make-transition table add-transition
-        ] if
-        s0 s1 2array stack push
-        t s1 table final-states>> set-at ] ;
-
-:: concatenate-nodes ( -- )
-    [let* | stack [ combine-stack get ]
-            table [ nfa-table get ]
-            s2 [ stack peek first ]
-            s3 [ stack pop second ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ] |
-        s1 s2 eps <literal-transition> table add-transition
-        s1 table final-states>> delete-at
-        s0 s3 2array stack push ] ;
-
-:: alternate-nodes ( -- )
-    [let* | stack [ combine-stack get ]
-            table [ nfa-table get ]
-            s2 [ stack peek first ]
-            s3 [ stack pop second ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ]
-            s4 [ next-state ]
-            s5 [ next-state ] |
-        s4 s0 eps <literal-transition> table add-transition
-        s4 s2 eps <literal-transition> table add-transition
-        s1 s5 eps <literal-transition> table add-transition
-        s3 s5 eps <literal-transition> table add-transition
-        s1 table final-states>> delete-at
-        s3 table final-states>> delete-at
-        t s5 table final-states>> set-at
-        s4 s5 2array stack push ] ;
-
-M: star nfa-node ( node -- )
-    term>> nfa-node
-    [let* | stack [ combine-stack get ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ]
-            s2 [ next-state ]
-            s3 [ next-state ]
-            table [ nfa-table get ] |
-        s1 table final-states>> delete-at
-        t s3 table final-states>> set-at
-        s1 s0 eps <literal-transition> table add-transition
-        s2 s0 eps <literal-transition> table add-transition
-        s2 s3 eps <literal-transition> table add-transition
-        s1 s3 eps <literal-transition> table add-transition
-        s2 s3 2array stack push ] ;
-
-M: concatenation nfa-node ( node -- )
-    seq>> [ eps literal-transition add-simple-entry ] [
-        reversed-regexp option? [ <reversed> ] when
-        [ [ nfa-node ] each ]
-        [ length 1- [ concatenate-nodes ] times ] bi
-    ] if-empty ;
-
-M: alternation nfa-node ( node -- )
-    seq>>
-    [ [ nfa-node ] each ]
-    [ length 1- [ alternate-nodes ] times ] bi ;
-
-M: integer nfa-node ( node -- )
+:: add-simple-entry ( obj class -- start-state end-state )
+    next-state :> s0
+    next-state :> s1
+    negated? get [
+        s0 f obj class make-transition table add-transition
+        s0 s1 <default-transition> table add-transition
+    ] [
+        s0 s1 obj class make-transition table add-transition
+    ] if
+    s0 s1 ;
+
+: epsilon-transition ( source target -- )
+    eps <literal-transition> table add-transition ;
+
+M:: star nfa-node ( node -- start end )
+    node term>> nfa-node :> s1 :> s0
+    next-state :> s2
+    next-state :> s3
+    s1 s0 epsilon-transition
+    s2 s0 epsilon-transition
+    s2 s3 epsilon-transition
+    s1 s3 epsilon-transition
+    s2 s3 ;
+
+M: epsilon nfa-node
+    drop eps literal-transition add-simple-entry ;
+
+M: concatenation nfa-node ( node -- start end )
+    [ first>> ] [ second>> ] bi
+    reversed-regexp option? [ swap ] when
+    [ nfa-node ] bi@
+    [ epsilon-transition ] dip ;
+
+:: alternate-nodes ( s0 s1 s2 s3 -- start end )
+    next-state :> s4
+    next-state :> s5
+    s4 s0 epsilon-transition
+    s4 s2 epsilon-transition
+    s1 s5 epsilon-transition
+    s3 s5 epsilon-transition
+    s4 s5 ;
+
+M: alternation nfa-node ( node -- start end )
+    [ first>> ] [ second>> ] bi
+    [ nfa-node ] bi@
+    alternate-nodes ;
+
+M: integer nfa-node ( node -- start end )
      case-insensitive option? [
          dup [ ch>lower ] [ ch>upper ] bi
          2dup = [
@@ -131,26 +100,26 @@ M: integer nfa-node ( node -- )
              literal-transition add-simple-entry
          ] [
              [ literal-transition add-simple-entry ] bi@
-            alternate-nodes drop
+            alternate-nodes [ nip ] dip
          ] if
      ] [
          literal-transition add-simple-entry
      ] if ;
  
-M: primitive-class nfa-node ( node -- )
+M: primitive-class nfa-node ( node -- start end )
      class>> dup
      { letter-class LETTER-class } member? case-insensitive option? and
      [ drop Letter-class ] when
      class-transition add-simple-entry ;
  
-M: any-char nfa-node ( node -- )
+M: any-char nfa-node ( node -- start end )
      [ dotall option? ] dip any-char-no-nl ?
      class-transition add-simple-entry ;
  
-M: negation nfa-node ( node -- )
+M: negation nfa-node ( node -- start end )
      negate term>> nfa-node negate ;
  
-M: range nfa-node ( node -- )
+M: range nfa-node ( node -- start end )
      case-insensitive option? [
          ! This should be implemented for Unicode by case-folding
          ! the input and all strings in the regexp.
@@ -169,15 +138,16 @@ M: range nfa-node ( node -- )
          class-transition add-simple-entry
      ] if ;
  
-M: with-options nfa-node ( node -- )
+M: with-options nfa-node ( node -- start end )
      dup options>> [ tree>> nfa-node ] using-options ;
  
  : construct-nfa ( ast -- nfa-table )
      [
          negated? off
-        V{ } clone combine-stack set
          0 state set
          <transition-table> clone nfa-table set
          nfa-node
-        set-start-state
+        table
+            swap dup associate >>final-states
+            swap >>start-state
      ] with-scope ;
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor

index dbd37f2d8ec1e703ede79d35067a730ce7e41621..6b2f28dbf6899982e0fadd1c3877be7521eccb0e 100644 (file)
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -4,6 +4,7 @@ USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
  combinators regexp.classes strings splitting peg locals accessors
  regexp.ast ;
  IN: regexp.parser
+
  : allowed-char? ( ch -- ? )
      ".()|[*+?" member? not ;
  
@@ -130,6 +131,7 @@ Parenthized = "?:" Alternation:a => [[ a ]]
              | "?" Options:on "-"? Options:off ":" Alternation:a
                  => [[ a on off parse-options <with-options> ]]
              | "?#" [^)]* => [[ f ]]
+            | "?~" Alternation:a => [[ a <negation> ]]
              | Alternation
  
  Element = "(" Parenthized:p ")" => [[ p ]]
diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor

index b6fd32a24538e5b9323b6ea685f3c0504f738174..189d430d85950a3adbee180733ce5de3996b5a4d 100644 (file)
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@@ -4,14 +4,15 @@ USING: accessors combinators kernel math sequences strings sets
  assocs prettyprint.backend prettyprint.custom make lexer
  namespaces parser arrays fry locals regexp.minimize
  regexp.parser regexp.nfa regexp.dfa regexp.traversal
-regexp.transition-tables splitting sorting regexp.ast ;
+regexp.transition-tables splitting sorting regexp.ast
+regexp.negation ;
  IN: regexp
  
  TUPLE: regexp raw parse-tree options dfa ;
  
  : <optioned-regexp> ( string options -- regexp )
      [ dup parse-regexp ] [ string>options ] bi*
-    2dup <with-options> construct-nfa construct-dfa minimize
+    2dup <with-options> ast>dfa
      regexp boa ;
  
  : <regexp> ( string -- regexp ) "" <optioned-regexp> ;
author	Daniel Ehrenberg <littledan@Macintosh-103.local>
	Thu, 19 Feb 2009 22:48:46 +0000 (16:48 -0600)
committer	Daniel Ehrenberg <littledan@Macintosh-103.local>
	Thu, 19 Feb 2009 22:48:46 +0000 (16:48 -0600)
basis/regexp/ast/ast.factor		patch \| blob \| history
basis/regexp/classes/classes.factor		patch \| blob \| history
basis/regexp/minimize/minimize.factor		patch \| blob \| history
basis/regexp/negation/negation-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/negation/negation.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/nfa/nfa.factor		patch \| blob \| history
basis/regexp/parser/parser.factor		patch \| blob \| history
basis/regexp/regexp.factor		patch \| blob \| history