Merge branch 'master' of git://factorcode.org/git/factor into regexp

author Daniel Ehrenberg <littledan@Macintosh-122.local>

Mon, 23 Feb 2009 19:10:54 +0000 (13:10 -0600)

committer Daniel Ehrenberg <littledan@Macintosh-122.local>

Mon, 23 Feb 2009 19:10:54 +0000 (13:10 -0600)
author Daniel Ehrenberg <littledan@Macintosh-122.local>
Mon, 23 Feb 2009 19:10:54 +0000 (13:10 -0600)
committer Daniel Ehrenberg <littledan@Macintosh-122.local>
Mon, 23 Feb 2009 19:10:54 +0000 (13:10 -0600)
diff --git a/basis/ascii/ascii.factor b/basis/ascii/ascii.factor

index 193e847d2714ee868e2e195373a067557bcf6b89..bd1b86b2793347fcf56dfa1923b9b87a4184a508 100644 (file)
--- a/basis/ascii/ascii.factor
+++ b/basis/ascii/ascii.factor
@@ -10,7 +10,7 @@ IN: ascii
  : LETTER? ( ch -- ? ) CHAR: A CHAR: Z between? ; inline\r
  : digit? ( ch -- ? ) CHAR: 0 CHAR: 9 between? ; inline\r
  : printable? ( ch -- ? ) CHAR: \s CHAR: ~ between? ; inline\r
-: control? ( ch -- ? ) "\0\e\r\n\t\u000008\u00007f" member? ; inline\r
+: control? ( ch -- ? ) { [ 0 HEX: 1F between? ] [ HEX: 7F = ] } 1|| ; inline\r
  : quotable? ( ch -- ? ) { [ printable? ] [ "\"\\" member? not ] } 1&& ; inline\r
  : Letter? ( ch -- ? ) { [ letter? ] [ LETTER? ] } 1|| ; inline\r
  : alpha? ( ch -- ? ) { [ Letter? ] [ digit? ] } 1|| ; inline\r
@@ -20,4 +20,4 @@ IN: ascii
  : >upper ( str -- upper ) [ ch>upper ] map ;\r
  \r
  HINTS: >lower string ;\r
-HINTS: >upper string ;
-\ No newline at end of file
+HINTS: >upper string ;\r
diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor

new file mode 100644 (file)

index 0000000..6574800
--- /dev/null
+++ b/basis/regexp/ast/ast.factor
@@ -0,0 +1,62 @@
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel arrays accessors fry sequences regexp.classes ;
+FROM: math.ranges => [a,b] ;
+IN: regexp.ast
+
+TUPLE: negation term ;
+C: <negation> negation
+
+TUPLE: from-to n m ;
+C: <from-to> from-to
+
+TUPLE: at-least n ;
+C: <at-least> at-least
+
+SINGLETON: epsilon
+
+TUPLE: concatenation first second ;
+
+: <concatenation> ( seq -- concatenation )
+    [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
+
+TUPLE: alternation first second ;
+
+: <alternation> ( seq -- alternation )
+    unclip [ alternation boa ] reduce ;
+
+TUPLE: star term ;
+C: <star> star
+
+TUPLE: with-options tree options ;
+C: <with-options> with-options
+
+TUPLE: options on off ;
+C: <options> options
+
+SINGLETONS: unix-lines dotall multiline comments case-insensitive
+unicode-case reversed-regexp ;
+
+: <maybe> ( term -- term' )
+    f <concatenation> 2array <alternation> ;
+
+: <plus> ( term -- term' )
+    dup <star> 2array <concatenation> ;
+
+: repetition ( n term -- term' )
+    <array> <concatenation> ;
+
+GENERIC: <times> ( term times -- term' )
+M: at-least <times>
+    n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
+M: from-to <times>
+    [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
+
+: char-class ( ranges ? -- term )
+    [ <or-class> ] dip [ <not-class> ] when ;
+
+TUPLE: lookahead term ;
+C: <lookahead> lookahead
+
+TUPLE: lookbehind term ;
+C: <lookbehind> lookbehind
diff --git a/basis/regexp/backend/backend.factor b/basis/regexp/backend/backend.factor

deleted file mode 100644 (file)

index 5eff057..0000000
--- a/basis/regexp/backend/backend.factor
+++ /dev/null
@@ -1,27 +0,0 @@
-! Copyright (C) 2008 Doug Coleman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors hashtables kernel math vectors ;
-IN: regexp.backend
-
-TUPLE: regexp
-    raw
-    { options hashtable }
-    stack
-    parse-tree
-    nfa-table
-    dfa-table
-    minimized-table
-    matchers
-    { nfa-traversal-flags hashtable }
-    { dfa-traversal-flags hashtable }
-    { state integer }
-    { new-states vector }
-    { visited-states hashtable } ;
-
-: reset-regexp ( regexp -- regexp )
-    0 >>state
-    V{ } clone >>stack
-    V{ } clone >>new-states
-    H{ } clone >>visited-states ;
-
-SYMBOL: current-regexp
diff --git a/basis/regexp/classes/classes-tests.factor b/basis/regexp/classes/classes-tests.factor

new file mode 100644 (file)

index 0000000..4cbb2e7
--- /dev/null
+++ b/basis/regexp/classes/classes-tests.factor
@@ -0,0 +1,25 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.classes tools.test arrays kernel ;
+IN: regexp.classes.tests
+
+[ f ] [ { 1 2 } <and-class> ] unit-test
+[ T{ or-class f { 2 1 } } ] [ { 1 2 } <or-class> ] unit-test
+[ 3 ] [ { 1 2 } <and-class> 3 2array <or-class> ] unit-test
+[ CHAR: A ] [ CHAR: A LETTER-class <primitive-class> 2array <and-class> ] unit-test
+[ CHAR: A ] [ LETTER-class <primitive-class> CHAR: A 2array <and-class> ] unit-test
+[ T{ primitive-class { class LETTER-class } } ] [ CHAR: A LETTER-class <primitive-class> 2array <or-class> ] unit-test
+[ T{ primitive-class { class LETTER-class } } ] [ LETTER-class <primitive-class> CHAR: A 2array <or-class> ] unit-test
+[ t ] [ { t 1 } <or-class> ] unit-test
+[ t ] [ { 1 t } <or-class> ] unit-test
+[ f ] [ { f 1 } <and-class> ] unit-test
+[ f ] [ { 1 f } <and-class> ] unit-test
+[ 1 ] [ { f 1 } <or-class> ] unit-test
+[ 1 ] [ { 1 f } <or-class> ] unit-test
+[ 1 ] [ { t 1 } <and-class> ] unit-test
+[ 1 ] [ { 1 t } <and-class> ] unit-test
+[ 1 ] [ 1 <not-class> <not-class> ] unit-test
+[ 1 ] [ { 1 1 } <and-class> ] unit-test
+[ 1 ] [ { 1 1 } <or-class> ] unit-test
+[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <and-class> ] unit-test
+[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <or-class> ] unit-test
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor

index 4a807fa51bbc0f815282c086e77d136517707b69..6e68e9e0f6da66449244914352b0e6ed753859ef 100644 (file)
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -1,7 +1,8 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors kernel math math.order words regexp.utils
-unicode.categories combinators.short-circuit ;
+USING: accessors kernel math math.order words combinators locals
+ascii unicode.categories combinators.short-circuit sequences ;
+QUALIFIED-WITH: multi-methods m
  IN: regexp.classes
  
  SINGLETONS: any-char any-char-no-nl
@@ -14,16 +15,16 @@ unmatchable-class terminator-class word-boundary-class ;
  SINGLETONS: beginning-of-input beginning-of-line
  end-of-input end-of-line ;
  
-MIXIN: node
-TUPLE: character-class-range from to ; INSTANCE: character-class-range node
+TUPLE: range from to ;
+C: <range> range
  
  GENERIC: class-member? ( obj class -- ? )
  
-M: t class-member? ( obj class -- ? ) 2drop f ;
+M: t class-member? ( obj class -- ? ) 2drop t ;
  
-M: integer class-member? ( obj class -- ? ) 2drop f ;
+M: integer class-member? ( obj class -- ? ) = ;
  
-M: character-class-range class-member? ( obj class -- ? )
+M: range class-member? ( obj class -- ? )
      [ from>> ] [ to>> ] bi between? ;
  
  M: any-char class-member? ( obj class -- ? )
@@ -47,16 +48,24 @@ M: ascii-class class-member? ( obj class -- ? )
  M: digit-class class-member? ( obj class -- ? )
      drop digit? ;
  
+: c-identifier-char? ( ch -- ? )
+    { [ alpha? ] [ CHAR: _ = ] } 1|| ;
+
  M: c-identifier-class class-member? ( obj class -- ? )
-    drop
-    { [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
+    drop c-identifier-char? ;
  
  M: alpha-class class-member? ( obj class -- ? )
      drop alpha? ;
  
+: punct? ( ch -- ? )
+    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
+
  M: punctuation-class class-member? ( obj class -- ? )
      drop punct? ;
  
+: java-printable? ( ch -- ? )
+    { [ alpha? ] [ punct? ] } 1|| ;
+
  M: java-printable-class class-member? ( obj class -- ? )
      drop java-printable? ;
  
@@ -64,11 +73,24 @@ M: non-newline-blank-class class-member? ( obj class -- ? )
      drop { [ blank? ] [ CHAR: \n = not ] } 1&& ;
  
  M: control-character-class class-member? ( obj class -- ? )
-    drop control-char? ;
+    drop control? ;
+
+: hex-digit? ( ch -- ? )
+    {
+        [ CHAR: A CHAR: F between? ]
+        [ CHAR: a CHAR: f between? ]
+        [ CHAR: 0 CHAR: 9 between? ]
+    } 1|| ;
  
  M: hex-digit-class class-member? ( obj class -- ? )
      drop hex-digit? ;
  
+: java-blank? ( ch -- ? )
+    {
+        CHAR: \s CHAR: \t CHAR: \n
+        HEX: b HEX: 7 CHAR: \r
+    } member? ;
+
  M: java-blank-class class-member? ( obj class -- ? )
      drop java-blank? ;
  
@@ -76,16 +98,106 @@ M: unmatchable-class class-member? ( obj class -- ? )
      2drop f ;
  
  M: terminator-class class-member? ( obj class -- ? )
-    drop {
-        [ CHAR: \r = ]
-        [ CHAR: \n = ]
-        [ CHAR: \u000085 = ]
-        [ CHAR: \u002028 = ]
-        [ CHAR: \u002029 = ]
-    } 1|| ;
+    drop "\r\n\u000085\u002029\u002028" member? ;
  
  M: beginning-of-line class-member? ( obj class -- ? )
      2drop f ;
  
  M: end-of-line class-member? ( obj class -- ? )
      2drop f ;
+
+M: f class-member? 2drop f ;
+
+TUPLE: primitive-class class ;
+C: <primitive-class> primitive-class
+
+TUPLE: or-class seq ;
+
+TUPLE: not-class class ;
+
+TUPLE: and-class seq ;
+
+m:GENERIC: combine-and ( class1 class2 -- combined ? )
+
+: replace-if-= ( object object -- object ? )
+    over = ;
+
+m:METHOD: combine-and { object object } replace-if-= ;
+
+m:METHOD: combine-and { integer integer }
+    2dup = [ drop t ] [ 2drop f t ] if ;
+
+m:METHOD: combine-and { t object }
+    nip t ;
+
+m:METHOD: combine-and { f object }
+    drop t ;
+
+m:METHOD: combine-and { not-class object }
+    [ class>> ] dip = [ f t ] [ f f ] if ;
+
+m:METHOD: combine-and { integer object }
+    2dup class-member? [ drop t ] [ 2drop f t ] if ;
+
+m:GENERIC: combine-or ( class1 class2 -- combined ? )
+
+m:METHOD: combine-or { object object } replace-if-= ;
+
+m:METHOD: combine-or { integer integer }
+    2dup = [ drop t ] [ 2drop f f ] if ;
+
+m:METHOD: combine-or { t object }
+    drop t ;
+
+m:METHOD: combine-or { f object }
+    nip t ;
+
+m:METHOD: combine-or { not-class object }
+    [ class>> ] dip = [ t t ] [ f f ] if ;
+
+m:METHOD: combine-or { integer object }
+    2dup class-member? [ nip t ] [ 2drop f f ] if ;
+
+: try-combine ( elt1 elt2 quot -- combined/f ? )
+    3dup call [ [ 3drop ] dip t ] [ drop swapd call ] if ; inline
+
+:: prefix-combining ( seq elt quot: ( elt1 elt2 -- combined/f ? ) -- newseq )
+    f :> combined!
+    seq [ elt quot try-combine swap combined! ] find drop
+    [ seq remove-nth combined prefix ]
+    [ seq elt prefix ] if* ; inline
+
+:: combine ( seq quot: ( elt1 elt2 -- combined/f ? ) empty class -- newseq )
+    seq { } [ quot prefix-combining ] reduce
+    dup length {
+        { 0 [ drop empty ] }
+        { 1 [ first ] }
+        [ drop class new swap >>seq ]
+    } case ; inline
+
+: <and-class> ( seq -- class )
+    [ combine-and ] t and-class combine ;
+
+M: and-class class-member?
+    seq>> [ class-member? ] with all? ;
+
+: <or-class> ( seq -- class )
+    [ combine-or ] f or-class combine ;
+
+M: or-class class-member?
+    seq>> [ class-member? ] with any? ;
+
+: <not-class> ( class -- inverse )
+    {
+        { t [ f ] }
+        { f [ t ] }
+        [ dup not-class? [ class>> ] [ not-class boa ] if ]
+    } case ;
+
+M: not-class class-member?
+    class>> class-member? not ;
+
+M: primitive-class class-member?
+    class>> class-member? ;
+
+UNION: class primitive-class not-class or-class range ;
diff --git a/basis/regexp/dfa/dfa-tests.factor b/basis/regexp/dfa/dfa-tests.factor

new file mode 100644 (file)

index 0000000..b6ce13c
--- /dev/null
+++ b/basis/regexp/dfa/dfa-tests.factor
@@ -0,0 +1,5 @@
+USING: regexp.dfa tools.test ;
+IN: regexp.dfa.tests
+
+[ [ ] [ ] while-changes ] must-infer
+
diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor

index 549669cab727328eabd5fd6244d247fb52495160..8c2e99516381f1f108546fb24dbd611ab4e32759 100644 (file)
--- a/basis/regexp/dfa/dfa.factor
+++ b/basis/regexp/dfa/dfa.factor
@@ -1,84 +1,79 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors arrays assocs combinators fry kernel locals
  math math.order regexp.nfa regexp.transition-tables sequences
-sets sorting vectors regexp.utils sequences.deep ;
-USING: io prettyprint threads ;
+sets sorting vectors ;
  IN: regexp.dfa
  
-: find-delta ( states transition regexp -- new-states )
-    nfa-table>> transitions>>
-    rot [ swap at at ] with with gather sift ;
+:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj )
+    obj quot call :> new-obj
+    new-obj comp call :> new-key
+    new-key old-key =
+    [ new-obj ]
+    [ new-obj quot comp new-key (while-changes) ]
+    if ; inline recursive
  
-: (find-epsilon-closure) ( states regexp -- new-states )
+: while-changes ( obj quot pred -- obj' )
+    3dup nip call (while-changes) ; inline
+
+: find-delta ( states transition nfa -- new-states )
+    transitions>> '[ _ swap _ at at ] gather sift ;
+
+: (find-epsilon-closure) ( states nfa -- new-states )
      eps swap find-delta ;
  
-: find-epsilon-closure ( states regexp -- new-states )
+: find-epsilon-closure ( states nfa -- new-states )
      '[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
      natural-sort ;
  
-: find-closure ( states transition regexp -- new-states )
-    [ find-delta ] 2keep nip find-epsilon-closure ;
+: find-closure ( states transition nfa -- new-states )
+    [ find-delta ] keep find-epsilon-closure ;
  
-: find-start-state ( regexp -- state )
-    [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ;
+: find-start-state ( nfa -- state )
+    [ start-state>> 1vector ] keep find-epsilon-closure ;
  
-: find-transitions ( seq1 regexp -- seq2 )
-    nfa-table>> transitions>>
-    [ at keys ] curry gather
+: find-transitions ( dfa-state nfa -- next-dfa-state )
+    transitions>>
+    '[ _ at keys ] gather
      eps swap remove ;
  
-: add-todo-state ( state regexp -- )
-    2dup visited-states>> key? [
-        2drop
-    ] [
-        [ visited-states>> conjoin ]
-        [ new-states>> push ] 2bi
+: add-todo-state ( state visited-states new-states -- )
+    3dup drop key? [ 3drop ] [
+        [ conjoin ] [ push ] bi-curry* bi
      ] if ;
  
-: new-transitions ( regexp -- )
-    dup new-states>> [
-        drop
-    ] [
-        dupd pop dup pick find-transitions rot
-        [
-            [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
-            [ swapd transition make-transition ] dip
-            dfa-table>> add-transition 
-        ] curry with each
-        new-transitions
+:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
+    new-states [ nfa dfa ] [
+        pop :> state
+        state nfa find-transitions
+        [| trans |
+            state trans nfa find-closure :> new-state
+            new-state visited-states new-states add-todo-state
+            state new-state trans transition make-transition dfa add-transition
+        ] each
+        nfa dfa new-states visited-states new-transitions
      ] if-empty ;
  
  : states ( hashtable -- array )
      [ keys ]
-    [ values [ values concat ] map concat append ] bi ;
+    [ values [ values concat ] map concat ] bi
+    append ;
  
-: set-final-states ( regexp -- )
-    dup
-    [ nfa-table>> final-states>> keys ]
-    [ dfa-table>> transitions>> states ] bi
-    [ intersects? ] with filter
-
-    swap dfa-table>> final-states>>
+: set-final-states ( nfa dfa -- )
+    [
+        [ final-states>> keys ]
+        [ transitions>> states ] bi*
+        [ intersects? ] with filter
+    ] [ final-states>> ] bi
      [ conjoin ] curry each ;
  
-: set-initial-state ( regexp -- )
-    dup
-    [ dfa-table>> ] [ find-start-state ] bi
-    [ >>start-state drop ] keep
-    1vector >>new-states drop ;
-
-: set-traversal-flags ( regexp -- )
-    dup
-    [ nfa-traversal-flags>> ]
-    [ dfa-table>> transitions>> keys ] bi
-    [ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc
-    >>dfa-traversal-flags drop ;
+: initialize-dfa ( nfa -- dfa )
+    <transition-table>
+        swap find-start-state >>start-state ;
  
-: construct-dfa ( regexp -- )
-    {
-        [ set-initial-state ]
-        [ new-transitions ]
-        [ set-final-states ]
-        [ set-traversal-flags ]
-    } cleave ;
+: construct-dfa ( nfa -- dfa )
+    dup initialize-dfa
+    dup start-state>> 1vector
+    H{ } clone
+    new-transitions
+    [ set-final-states ] keep ;
diff --git a/basis/regexp/disambiguate/disambiguate.factor b/basis/regexp/disambiguate/disambiguate.factor

new file mode 100644 (file)

index 0000000..0b63351
--- /dev/null
+++ b/basis/regexp/disambiguate/disambiguate.factor
@@ -0,0 +1,39 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel accessors regexp.classes math.bits assocs sequences
+arrays sets regexp.dfa math fry regexp.minimize ;
+IN: regexp.disambiguate
+
+TUPLE: parts in out ;
+
+: make-partition ( choices classes -- partition )
+    zip [ first ] partition [ values ] bi@ parts boa ;
+
+: powerset-partition ( classes -- partitions )
+    [ length [ 2^ ] keep ] keep '[
+        _ <bits> _ make-partition
+    ] map rest ;
+
+: partition>class ( parts -- class )
+    [ out>> [ <not-class> ] map ]
+    [ in>> <and-class> ] bi
+    prefix <and-class> ;
+
+: get-transitions ( partition state-transitions -- next-states )
+    [ in>> ] dip '[ _ at ] map prune ;
+
+: disambiguate ( dfa -- nfa )  
+    [
+        [
+            [ keys powerset-partition ] keep '[
+                [ partition>class ]
+                [ _ get-transitions ] bi
+            ] H{ } map>assoc
+            [ drop ] assoc-filter 
+        ] assoc-map
+    ] change-transitions ;
+
+: nfa>dfa ( nfa -- dfa )
+    construct-dfa
+    minimize disambiguate
+    construct-dfa minimize ;
diff --git a/basis/regexp/minimize/minimize-tests.factor b/basis/regexp/minimize/minimize-tests.factor

new file mode 100644 (file)

index 0000000..78a90ca
--- /dev/null
+++ b/basis/regexp/minimize/minimize-tests.factor
@@ -0,0 +1,48 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test regexp.minimize assocs regexp accessors regexp.transition-tables ;
+IN: regexp.minimize.tests
+
+[ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test
+[ t ] [ 2 1 H{ { { 1 2 } t } } same-partition? ] unit-test
+[ f ] [ 2 3 H{ { { 1 2 } t } } same-partition? ] unit-test
+
+[ H{ { 1 1 } { 2 1 } { 3 3 } { 4 3 } } ]
+[ { { 1 1 } { 1 2 } { 2 2 } { 3 3 } { 3 4 } { 4 4 } } [ t ] H{ } map>assoc partition>classes ] unit-test
+
+[ { { 1 2 } { 3 4 } } ] [ H{ { "elephant" 1 } { "tiger" 3 } } H{ { "elephant" 2 } { "tiger" 4 } } assemble-values ] unit-test
+
+[ 3 ] [ R/ ab|ac/ dfa>> transitions>> assoc-size ] unit-test
+[ 3 ] [ R/ a(b|c)/ dfa>> transitions>> assoc-size ] unit-test
+[ 1 ] [ R/ ((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test
+[ 1 ] [ R/ a|((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test
+[ 2 ] [ R/ ab|((aa*)*)*b/ dfa>> transitions>> assoc-size ] unit-test
+[ 4 ] [ R/ ab|cd/ dfa>> transitions>> assoc-size ] unit-test
+[ 1 ] [ R/ [a-z]*|[A-Z]*/i dfa>> transitions>> assoc-size ] unit-test
+
+[
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } { CHAR: b 1 } } }
+            { 1 H{ { CHAR: a 2 } { CHAR: b 2 } } }
+            { 2 H{ { CHAR: c 3 } } }
+            { 3 H{ } }
+        } }
+        { start-state 0 }
+        { final-states H{ { 3 3 } } }
+    }
+] [ 
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } { CHAR: b 4 } } }
+            { 1 H{ { CHAR: a 2 } { CHAR: b 5 } } }
+            { 2 H{ { CHAR: c 3 } } }
+            { 3 H{ } }
+            { 4 H{ { CHAR: a 2 } { CHAR: b 5 } } }
+            { 5 H{ { CHAR: c 6 } } }
+            { 6 H{ } }
+        } }
+        { start-state 0 }
+        { final-states H{ { 3 3 } { 6 6 } } }
+    } combine-states
+] unit-test
diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor

new file mode 100644 (file)

index 0000000..163e87f
--- /dev/null
+++ b/basis/regexp/minimize/minimize.factor
@@ -0,0 +1,90 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel sequences regexp.transition-tables fry assocs
+accessors locals math sorting arrays sets hashtables regexp.dfa
+combinators.short-circuit ;
+IN: regexp.minimize
+
+: number-transitions ( transitions numbering -- new-transitions )
+    dup '[
+        [ _ at ]
+        [ [ first _ at ] assoc-map ] bi*
+    ] assoc-map ;
+
+: table>state-numbers ( table -- assoc )
+    transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
+
+: map-set ( assoc quot -- new-assoc )
+    '[ drop @ dup ] assoc-map ; inline
+
+: rewrite-transitions ( transition-table assoc quot -- transition-table )
+    [
+        [ '[ _ at ] change-start-state ]
+        [ '[ [ _ at ] map-set ] change-final-states ]
+        [ ] tri
+    ] dip '[ _ @ ] change-transitions ; inline
+
+: number-states ( table -- newtable )
+    dup table>state-numbers
+    [ number-transitions ] rewrite-transitions ;
+
+: initially-same? ( s1 s2 transition-table -- ? )
+    {
+        [ drop <= ]
+        [ transitions>> '[ _ at keys ] bi@ set= ]
+        [ final-states>> '[ _ key? ] bi@ = ]
+    } 3&& ;
+
+:: initialize-partitions ( transition-table -- partitions )
+    ! Partition table is sorted-array => ?
+    H{ } clone :> out
+    transition-table transitions>> keys :> states
+    states [| s1 |
+        states [| s2 |
+            s1 s2 transition-table initially-same?
+            [ s1 s2 2array out conjoin ] when
+        ] each
+    ] each out ;
+
+: same-partition? ( s1 s2 partitions -- ? )
+    [ 2array natural-sort ] dip key? ;
+
+: assemble-values ( assoc1 assoc2 -- values )
+    dup keys '[ _ swap [ at ] curry map ] bi@ zip ;
+
+: stay-same? ( s1 s2 transition partitions -- ? )
+    [ '[ _ transitions>> at ] bi@ assemble-values ] dip
+    '[ _ same-partition? ] assoc-all? ;
+
+: partition-more ( partitions transition-table -- partitions )
+    over '[ drop first2 _ _ stay-same? ] assoc-filter ;
+
+: partition>classes ( partitions -- synonyms ) ! old-state => new-state
+    >alist sort-keys
+    [ drop first2 swap ] assoc-map
+    <reversed>
+    >hashtable ;
+
+: state-classes ( transition-table -- synonyms )
+    [ initialize-partitions ] keep
+    '[ _ partition-more ] [ assoc-size ] while-changes
+    partition>classes ;
+
+: canonical-state? ( state state-classes -- ? )
+    dupd at = ;
+
+: delete-duplicates ( transitions state-classes -- new-transitions )
+    '[ drop _ canonical-state? ] assoc-filter ;
+
+: rewrite-duplicates ( new-transitions state-classes -- new-transitions )
+    '[ [ _ at ] assoc-map ] assoc-map ;
+
+: combine-transitions ( transitions state-classes -- new-transitions )
+    [ delete-duplicates ] [ rewrite-duplicates ] bi ;
+
+: combine-states ( table -- smaller-table )
+    dup state-classes
+    [ combine-transitions ] rewrite-transitions ;
+
+: minimize ( table -- minimal-table )
+    clone number-states combine-states ;
diff --git a/basis/regexp/negation/negation-tests.factor b/basis/regexp/negation/negation-tests.factor

new file mode 100644 (file)

index 0000000..41dfe7f
--- /dev/null
+++ b/basis/regexp/negation/negation-tests.factor
@@ -0,0 +1,27 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
+IN: regexp.negation.tests
+
+[
+    ! R/ |[^a]|.+/
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } { T{ not-class f CHAR: a } -1 } } }
+            { 1 H{ { t -1 } } }
+            { -1 H{ { t -1 } } }
+        } } 
+        { start-state 0 }
+        { final-states H{ { 0 0 } { -1 -1 } } }
+    }
+] [
+    ! R/ a/
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } } }
+            { 1 H{ } } 
+        } }
+        { start-state 0 }
+        { final-states H{ { 1 1 } } }
+    } negate-table
+] unit-test
diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor

new file mode 100644 (file)

index 0000000..f5a43a2
--- /dev/null
+++ b/basis/regexp/negation/negation.factor
@@ -0,0 +1,61 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.nfa regexp.disambiguate kernel sequences
+assocs regexp.classes hashtables accessors fry vectors
+regexp.ast regexp.transition-tables regexp.minimize ;
+IN: regexp.negation
+
+: ast>dfa ( parse-tree -- minimal-dfa )
+    construct-nfa nfa>dfa ;
+
+CONSTANT: fail-state -1
+
+: add-default-transition ( state's-transitions -- new-state's-transitions )
+    clone dup
+    [ [ fail-state ] dip keys [ <not-class> ] map <and-class> ] keep set-at ;
+
+: fail-state-recurses ( transitions -- new-transitions )
+    clone dup
+    [ fail-state t associate fail-state ] dip set-at ;
+
+: add-fail-state ( transitions -- new-transitions )
+    [ add-default-transition ] assoc-map
+    fail-state-recurses ;
+
+: assoc>set ( assoc -- keys-set )
+    [ drop dup ] assoc-map ;
+
+: inverse-final-states ( transition-table -- final-states )
+    [ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
+
+: negate-table ( transition-table -- transition-table )
+    clone
+        [ add-fail-state ] change-transitions
+        dup inverse-final-states >>final-states ;
+
+: renumber-transitions ( transitions numbering -- new-transitions )
+    dup '[
+        [ _ at ]
+        [ [ [ _ at ] map ] assoc-map ] bi*
+    ] assoc-map ;
+
+: renumber-states ( transition-table -- transition-table )
+    dup transitions>> keys [ next-state ] H{ } map>assoc
+    [ renumber-transitions ] rewrite-transitions ;
+
+: box-transitions ( transition-table -- transition-table )
+    [ [ [ 1vector ] assoc-map ] assoc-map ] change-transitions ;
+
+: unify-final-state ( transition-table -- transition-table )
+    dup [ final-states>> keys ] keep
+    '[ -2 eps <literal-transition> _ add-transition ] each
+    H{ { -2 -2 } } >>final-states ;
+
+: adjoin-dfa ( transition-table -- start end )
+    box-transitions unify-final-state renumber-states
+    [ start-state>> ]
+    [ final-states>> keys first ]
+    [ table [ transitions>> ] bi@ swap update ] tri ;
+
+M: negation nfa-node ( node -- start end )
+    term>> ast>dfa negate-table adjoin-dfa ;
diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor

index 537c85c2d3b20acfd305a3903bab4b27a3a08667..72ce880f8bcc31e8458a82fe71a9ac98f11237e8 100644 (file)
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@@ -1,235 +1,160 @@
  ! Copyright (C) 2008 Doug Coleman.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs grouping kernel regexp.backend
-locals math namespaces regexp.parser sequences fry quotations
-math.order math.ranges vectors unicode.categories regexp.utils
-regexp.transition-tables words sets regexp.classes unicode.case.private ;
+USING: accessors arrays assocs grouping kernel
+locals math namespaces sequences fry quotations
+math.order math.ranges vectors unicode.categories
+regexp.transition-tables words sets hashtables combinators.short-circuit
+unicode.case.private regexp.ast regexp.classes ;
+IN: regexp.nfa
+
  ! This uses unicode.case.private for ch>upper and ch>lower
  ! but case-insensitive matching should be done by case-folding everything
  ! before processing starts
-IN: regexp.nfa
  
-ERROR: feature-is-broken feature ;
+GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
+! This is unfinished and does nothing right now!
+
+M: object remove-lookahead ;
  
-SYMBOL: negation-mode
-: negated? ( -- ? ) negation-mode get 0 or odd? ; 
+M: with-options remove-lookahead
+    [ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
+
+M: alternation remove-lookahead
+    [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ alternation boa ;
+
+M: concatenation remove-lookahead ;
  
  SINGLETON: eps
  
-MIXIN: traversal-flag
-SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
-SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
-SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
-SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
-SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
-SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
-SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
-SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
-SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
-
-: options ( -- obj ) current-regexp get options>> ;
-
-: option? ( obj -- ? ) options key? ;
-
-: option-on ( obj -- ) options conjoin ;
-
-: option-off ( obj -- ) options delete-at ;
-
-: next-state ( regexp -- state )
-    [ state>> ] [ [ 1+ ] change-state drop ] bi ;
-
-: set-start-state ( regexp -- )
-    dup stack>> [
-        drop
-    ] [
-        [ nfa-table>> ] [ pop first ] bi* >>start-state drop
-    ] if-empty ;
-
-GENERIC: nfa-node ( node -- )
-
-:: add-simple-entry ( obj class -- )
-    [let* | regexp [ current-regexp get ]
-            s0 [ regexp next-state ]
-            s1 [ regexp next-state ]
-            stack [ regexp stack>> ]
-            table [ regexp nfa-table>> ] |
-        negated? [
-            s0 f obj class make-transition table add-transition
-            s0 s1 <default-transition> table add-transition
-        ] [
-            s0 s1 obj class make-transition table add-transition
-        ] if
-        s0 s1 2array stack push
-        t s1 table final-states>> set-at ] ;
-
-: add-traversal-flag ( flag -- )
-    stack peek second
-    current-regexp get nfa-traversal-flags>> push-at ;
-
-:: concatenate-nodes ( -- )
-    [let* | regexp [ current-regexp get ]
-            stack [ regexp stack>> ]
-            table [ regexp nfa-table>> ]
-            s2 [ stack peek first ]
-            s3 [ stack pop second ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ] |
-        s1 s2 eps <literal-transition> table add-transition
-        s1 table final-states>> delete-at
-        s0 s3 2array stack push ] ;
-
-:: alternate-nodes ( -- )
-    [let* | regexp [ current-regexp get ]
-            stack [ regexp stack>> ]
-            table [ regexp nfa-table>> ]
-            s2 [ stack peek first ]
-            s3 [ stack pop second ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ]
-            s4 [ regexp next-state ]
-            s5 [ regexp next-state ] |
-        s4 s0 eps <literal-transition> table add-transition
-        s4 s2 eps <literal-transition> table add-transition
-        s1 s5 eps <literal-transition> table add-transition
-        s3 s5 eps <literal-transition> table add-transition
-        s1 table final-states>> delete-at
-        s3 table final-states>> delete-at
-        t s5 table final-states>> set-at
-        s4 s5 2array stack push ] ;
-
-M: kleene-star nfa-node ( node -- )
-    term>> nfa-node
-    [let* | regexp [ current-regexp get ]
-            stack [ regexp stack>> ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ]
-            s2 [ regexp next-state ]
-            s3 [ regexp next-state ]
-            table [ regexp nfa-table>> ] |
-        s1 table final-states>> delete-at
-        t s3 table final-states>> set-at
-        s1 s0 eps <literal-transition> table add-transition
-        s2 s0 eps <literal-transition> table add-transition
-        s2 s3 eps <literal-transition> table add-transition
-        s1 s3 eps <literal-transition> table add-transition
-        s2 s3 2array stack push ] ;
-
-M: concatenation nfa-node ( node -- )
-    seq>>
-    reversed-regexp option? [ <reversed> ] when
-    [ [ nfa-node ] each ]
-    [ length 1- [ concatenate-nodes ] times ] bi ;
-
-M: alternation nfa-node ( node -- )
-    seq>>
-    [ [ nfa-node ] each ]
-    [ length 1- [ alternate-nodes ] times ] bi ;
-
-M: constant nfa-node ( node -- )
-    case-insensitive option? [
-        dup char>> [ ch>lower ] [ ch>upper ] bi
-        2dup = [
-            2drop
-            char>> literal-transition add-simple-entry
-        ] [
-            [ literal-transition add-simple-entry ] bi@
-            alternate-nodes drop
-        ] if
-    ] [
-        char>> literal-transition add-simple-entry
-    ] if ;
-
-M: epsilon nfa-node ( node -- )
+SYMBOL: option-stack
+
+SYMBOL: state
+
+: next-state ( -- state )
+    state [ get ] [ inc ] bi ;
+
+SYMBOL: nfa-table
+: table ( -- table ) nfa-table get ;
+
+: set-each ( keys value hashtable -- )
+    '[ _ swap _ set-at ] each ;
+
+: options>hash ( options -- hashtable )
+    H{ } clone [
+        [ [ on>> t ] dip set-each ]
+        [ [ off>> f ] dip set-each ] 2bi
+    ] keep ;
+
+: using-options ( options quot -- )
+    [ options>hash option-stack [ ?push ] change ] dip
+    call option-stack get pop* ; inline
+
+: option? ( obj -- ? )
+    option-stack get assoc-stack ;
+
+GENERIC: nfa-node ( node -- start-state end-state )
+
+: add-simple-entry ( obj class -- start-state end-state )
+    [ next-state next-state 2dup ] 2dip
+    make-transition table add-transition ;
+
+: epsilon-transition ( source target -- )
+    eps <literal-transition> table add-transition ;
+
+M:: star nfa-node ( node -- start end )
+    node term>> nfa-node :> s1 :> s0
+    next-state :> s2
+    next-state :> s3
+    s1 s0 epsilon-transition
+    s2 s0 epsilon-transition
+    s2 s3 epsilon-transition
+    s1 s3 epsilon-transition
+    s2 s3 ;
+
+M: epsilon nfa-node
      drop eps literal-transition add-simple-entry ;
  
-M: word nfa-node ( node -- ) class-transition add-simple-entry ;
+M: concatenation nfa-node ( node -- start end )
+    [ first>> ] [ second>> ] bi
+    reversed-regexp option? [ swap ] when
+    [ nfa-node ] bi@
+    [ epsilon-transition ] dip ;
+
+:: alternate-nodes ( s0 s1 s2 s3 -- start end )
+    next-state :> s4
+    next-state :> s5
+    s4 s0 epsilon-transition
+    s4 s2 epsilon-transition
+    s1 s5 epsilon-transition
+    s3 s5 epsilon-transition
+    s4 s5 ;
+
+M: alternation nfa-node ( node -- start end )
+    [ first>> ] [ second>> ] bi
+    [ nfa-node ] bi@
+    alternate-nodes ;
  
-M: any-char nfa-node ( node -- )
-    [ dotall option? ] dip any-char-no-nl ?
-    class-transition add-simple-entry ;
+GENERIC: modify-class ( char-class -- char-class' )
  
-! M: beginning-of-text nfa-node ( node -- ) ;
+M: object modify-class ;
  
-M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M: integer modify-class
+    case-insensitive option? [
+        dup Letter? [
+            [ ch>lower ] [ ch>upper ] bi 2array <or-class>
+        ] when
+    ] when ;
+
+M: integer nfa-node ( node -- start end )
+    modify-class dup class?
+    class-transition literal-transition ?
+    add-simple-entry ;
+
+M: primitive-class modify-class
+    class>> modify-class <primitive-class> ;
+
+M: or-class modify-class
+    seq>> [ modify-class ] map <or-class> ;
  
-M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M: not-class modify-class
+    class>> modify-class <not-class> ;
  
-: choose-letter-class ( node -- node' )
-    case-insensitive option? Letter-class rot ? ;
+M: any-char modify-class
+    drop dotall option? t any-char-no-nl ? ;
  
-M: letter-class nfa-node ( node -- )
-    choose-letter-class class-transition add-simple-entry ;
+: modify-letter-class ( class -- newclass )
+    case-insensitive option? [ drop Letter-class ] when ;
+M: letter-class modify-class modify-letter-class ;
+M: LETTER-class modify-class modify-letter-class ;
  
-M: LETTER-class nfa-node ( node -- )
-    choose-letter-class class-transition add-simple-entry ;
+: cased-range? ( range -- ? )
+    [ from>> ] [ to>> ] bi {
+        [ [ letter? ] bi@ and ]
+        [ [ LETTER? ] bi@ and ]
+    } 2|| ;
  
-M: character-class-range nfa-node ( node -- )
+M: range modify-class
      case-insensitive option? [
-        ! This should be implemented for Unicode by case-folding
-        ! the input and all strings in the regexp.
-        dup [ from>> ] [ to>> ] bi
-        2dup [ Letter? ] bi@ and [
-            rot drop
-            [ [ ch>lower ] bi@ character-class-range boa ]
-            [ [ ch>upper ] bi@ character-class-range boa ] 2bi 
-            [ class-transition add-simple-entry ] bi@
-            alternate-nodes
-        ] [
-            2drop
-            class-transition add-simple-entry
-        ] if
-    ] [
-        class-transition add-simple-entry
-    ] if ;
-
-M: capture-group nfa-node ( node -- )
-    "capture-groups" feature-is-broken
-    eps literal-transition add-simple-entry
-    capture-group-on add-traversal-flag
-    term>> nfa-node
-    eps literal-transition add-simple-entry
-    capture-group-off add-traversal-flag
-    2 [ concatenate-nodes ] times ;
-
-! xyzzy
-M: non-capture-group nfa-node ( node -- )
-    term>> nfa-node ;
-
-M: reluctant-kleene-star nfa-node ( node -- )
-    term>> <kleene-star> nfa-node ;
-
-M: negation nfa-node ( node -- )
-    negation-mode inc
-    term>> nfa-node 
-    negation-mode dec ;
-
-M: lookahead nfa-node ( node -- )
-    "lookahead" feature-is-broken
-    eps literal-transition add-simple-entry
-    lookahead-on add-traversal-flag
-    term>> nfa-node
-    eps literal-transition add-simple-entry
-    lookahead-off add-traversal-flag
-    2 [ concatenate-nodes ] times ;
-
-M: lookbehind nfa-node ( node -- )
-    "lookbehind" feature-is-broken
-    eps literal-transition add-simple-entry
-    lookbehind-on add-traversal-flag
-    term>> nfa-node
-    eps literal-transition add-simple-entry
-    lookbehind-off add-traversal-flag
-    2 [ concatenate-nodes ] times ;
-
-M: option nfa-node ( node -- )
-    [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
-    eps literal-transition add-simple-entry ;
-
-: construct-nfa ( regexp -- )
+        dup cased-range? [
+            [ from>> ] [ to>> ] bi
+            [ [ ch>lower ] bi@ <range> ]
+            [ [ ch>upper ] bi@ <range> ] 2bi 
+            2array <or-class>
+        ] when
+    ] when ;
+
+M: class nfa-node
+    modify-class class-transition add-simple-entry ;
+
+M: with-options nfa-node ( node -- start end )
+    dup options>> [ tree>> nfa-node ] using-options ;
+
+: construct-nfa ( ast -- nfa-table )
      [
-        reset-regexp
-        negation-mode off
-        [ current-regexp set ]
-        [ parse-tree>> nfa-node ]
-        [ set-start-state ] tri
+        0 state set
+        <transition-table> nfa-table set
+        remove-lookahead nfa-node
+        table
+            swap dup associate >>final-states
+            swap >>start-state
      ] with-scope ;
diff --git a/basis/regexp/parser/parser-tests.factor b/basis/regexp/parser/parser-tests.factor

index fe4d2f1d1a877d141c679519b22a8eb4e58df88e..d606015f617e19e5e3a181174e0425df838593c1 100644 (file)
--- a/basis/regexp/parser/parser-tests.factor
+++ b/basis/regexp/parser/parser-tests.factor
@@ -1,34 +1,24 @@
-USING: kernel tools.test regexp.backend regexp ;
-IN: regexp.parser
+USING: kernel tools.test regexp.parser fry sequences ;
+IN: regexp.parser.tests
  
-: test-regexp ( string -- )
-    default-regexp parse-regexp ;
+: regexp-parses ( string -- )
+    [ [ ] ] dip '[ _ parse-regexp drop ] unit-test ;
  
-! [ "(" ] [ unmatched-parentheses? ] must-fail-with
+: regexp-fails ( string -- )
+    '[ _ parse-regexp ] must-fail ;
  
-[ ] [ "a|b" test-regexp ] unit-test
-[ ] [ "a.b" test-regexp ] unit-test
-[ ] [ "a|b|c" test-regexp ] unit-test
-[ ] [ "abc|b" test-regexp ] unit-test
-[ ] [ "a|bcd" test-regexp ] unit-test
-[ ] [ "a|(b)" test-regexp ] unit-test
-[ ] [ "(a)|b" test-regexp ] unit-test
-[ ] [ "(a|b)" test-regexp ] unit-test
-[ ] [ "((a)|(b))" test-regexp ] unit-test
+{
+    "a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||"
+    "(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|"
+    "[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]"
+    "[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
+    "(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]"
+    "[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)"
+    "\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}"
+    "\\ueeee" "\\0333" "\\xff" "\\\\" "\\w"
+} [ regexp-parses ] each
  
-[ ] [ "(?:a)" test-regexp ] unit-test
-[ ] [ "(?i:a)" test-regexp ] unit-test
-[ ] [ "(?-i:a)" test-regexp ] unit-test
-[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
-[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
-
-[ ] [ "(?=a)" test-regexp ] unit-test
-
-[ ] [ "[abc]" test-regexp ] unit-test
-[ ] [ "[a-c]" test-regexp ] unit-test
-[ ] [ "[^a-c]" test-regexp ] unit-test
-[ "[^]" test-regexp ] must-fail
-
-[ ] [ "|b" test-regexp ] unit-test
-[ ] [ "b|" test-regexp ] unit-test
-[ ] [ "||" test-regexp ] unit-test
+{
+    "[^]" "[]" "a{foo}" "a{,}" "a{}" "(?)" "\\p{foo}" "\\P{foo}"
+    "\\ueeeg" "\\0339" "\\xfg"
+} [ regexp-fails ] each
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor

index 377535eccd1aac074ac4b39bbfc18472c860bcc5..56c6b1eb04cd53bc096094ec2f124cebdb44b519 100644 (file)
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -1,437 +1,168 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs combinators io io.streams.string
-kernel math math.parser namespaces sets
-quotations sequences splitting vectors math.order
-strings regexp.backend regexp.utils
-unicode.case unicode.categories words locals regexp.classes ;
+USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
+combinators regexp.classes strings splitting peg locals accessors
+regexp.ast ;
  IN: regexp.parser
  
-FROM: math.ranges => [a,b] ;
-
-TUPLE: concatenation seq ; INSTANCE: concatenation node
-TUPLE: alternation seq ; INSTANCE: alternation node
-TUPLE: kleene-star term ; INSTANCE: kleene-star node
-
-! !!!!!!!!
-TUPLE: possessive-question term ; INSTANCE: possessive-question node
-TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node
-
-! !!!!!!!!
-TUPLE: reluctant-question term ; INSTANCE: reluctant-question node
-TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
-
-TUPLE: negation term ; INSTANCE: negation node
-TUPLE: constant char ; INSTANCE: constant node
-TUPLE: range from to ; INSTANCE: range node
-
-MIXIN: parentheses-group
-TUPLE: lookahead term ; INSTANCE: lookahead node
-INSTANCE: lookahead parentheses-group
-TUPLE: lookbehind term ; INSTANCE: lookbehind node
-INSTANCE: lookbehind parentheses-group
-TUPLE: capture-group term ; INSTANCE: capture-group node
-INSTANCE: capture-group parentheses-group
-TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
-INSTANCE: non-capture-group parentheses-group
-TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
-INSTANCE: independent-group parentheses-group
-TUPLE: comment-group term ; INSTANCE: comment-group node
-INSTANCE: comment-group parentheses-group
-
-SINGLETON: epsilon INSTANCE: epsilon node
-
-TUPLE: option option on? ; INSTANCE: option node
-
-SINGLETONS: unix-lines dotall multiline comments case-insensitive
-unicode-case reversed-regexp ;
-
-SINGLETONS: beginning-of-character-class end-of-character-class
-left-parenthesis pipe caret dash ;
-
-: push1 ( obj -- ) input-stream get stream>> push ;
-: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
-: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
-: drop1 ( -- ) read1 drop ;
-
-: stack ( -- obj ) current-regexp get stack>> ;
-: change-whole-stack ( quot -- )
-    current-regexp get
-    [ stack>> swap call ] keep (>>stack) ; inline
-: push-stack ( obj -- ) stack push ;
-: pop-stack ( -- obj ) stack pop ;
-: cut-out ( vector n -- vector' vector ) cut rest ;
-ERROR: cut-stack-error ;
-: cut-stack ( obj vector -- vector' vector )
-    [ nip ] [ last-index ] 2bi [ cut-stack-error ] unless* cut-out swap ;
-
-: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
-: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
-: <possessive-question> ( obj -- kleene ) possessive-question boa ;
-: <reluctant-question> ( obj -- kleene ) reluctant-question boa ;
-
-: <negation> ( obj -- negation ) negation boa ;
-: <concatenation> ( seq -- concatenation )
-    >vector [ epsilon ] [ concatenation boa ] if-empty ;
-: <alternation> ( seq -- alternation ) >vector alternation boa ;
-: <capture-group> ( obj -- capture-group ) capture-group boa ;
-: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
-: <constant> ( obj -- constant ) constant boa ;
-
-: first|concatenation ( seq -- first/concatenation )
-    dup length 1 = [ first ] [ <concatenation> ] if ;
-
-: first|alternation ( seq -- first/alternation )
-    dup length 1 = [ first ] [ <alternation> ] if ;
-
-: <character-class-range> ( from to -- obj )
-    2dup <
-    [ character-class-range boa ] [ 2drop unmatchable-class ] if ;
-
-ERROR: unmatched-parentheses ;
-
-ERROR: unknown-regexp-option option ;
+: allowed-char? ( ch -- ? )
+    ".()|[*+?" member? not ;
  
-: ch>option ( ch -- singleton )
+ERROR: bad-number ;
+
+: ensure-number ( n -- n )
+    [ bad-number ] unless* ;
+
+:: at-error ( key assoc quot: ( key -- replacement ) -- value )
+    key assoc at* [ drop key quot call ] unless ; inline
+
+ERROR: bad-class name ;
+
+: name>class ( name -- class )
+    {
+        { "Lower" letter-class }
+        { "Upper" LETTER-class }
+        { "Alpha" Letter-class }
+        { "ASCII" ascii-class }
+        { "Digit" digit-class }
+        { "Alnum" alpha-class }
+        { "Punct" punctuation-class }
+        { "Graph" java-printable-class }
+        { "Print" java-printable-class }
+        { "Blank" non-newline-blank-class }
+        { "Cntrl" control-character-class }
+        { "XDigit" hex-digit-class }
+        { "Space" java-blank-class }
+        ! TODO: unicode-character-class
+    } [ bad-class ] at-error ;
+
+: lookup-escape ( char -- ast )
      {
-        { CHAR: i [ case-insensitive ] }
-        { CHAR: d [ unix-lines ] }
-        { CHAR: m [ multiline ] }
-        { CHAR: n [ multiline ] }
-        { CHAR: r [ reversed-regexp ] }
-        { CHAR: s [ dotall ] }
-        { CHAR: u [ unicode-case ] }
-        { CHAR: x [ comments ] }
-        [ unknown-regexp-option ]
+        { CHAR: t [ CHAR: \t ] }
+        { CHAR: n [ CHAR: \n ] }
+        { CHAR: r [ CHAR: \r ] }
+        { CHAR: f [ HEX: c ] }
+        { CHAR: a [ HEX: 7 ] }
+        { CHAR: e [ HEX: 1b ] }
+        { CHAR: \\ [ CHAR: \\ ] }
+
+        { CHAR: w [ c-identifier-class <primitive-class> ] }
+        { CHAR: W [ c-identifier-class <primitive-class> <not-class> ] }
+        { CHAR: s [ java-blank-class <primitive-class> ] }
+        { CHAR: S [ java-blank-class <primitive-class> <not-class> ] }
+        { CHAR: d [ digit-class <primitive-class> ] }
+        { CHAR: D [ digit-class <primitive-class> <not-class> ] }
+
+        [ ]
      } case ;
  
+: options-assoc ( -- assoc )
+    H{
+        { CHAR: i case-insensitive }
+        { CHAR: d unix-lines }
+        { CHAR: m multiline }
+        { CHAR: n multiline }
+        { CHAR: r reversed-regexp }
+        { CHAR: s dotall }
+        { CHAR: u unicode-case }
+        { CHAR: x comments }
+    } ;
+
+: ch>option ( ch -- singleton )
+    options-assoc at ;
+
  : option>ch ( option -- string )
-    {
-        { case-insensitive [ CHAR: i ] }
-        { multiline [ CHAR: m ] }
-        { reversed-regexp [ CHAR: r ] }
-        { dotall [ CHAR: s ] }
-        [ unknown-regexp-option ]
-    } case ;
+    options-assoc value-at ;
  
-: toggle-option ( ch ? -- ) 
-    [ ch>option ] dip option boa push-stack ;
-
-: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
-
-: parse-options ( string -- )
-    "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
-
-ERROR: bad-special-group string ;
-
-DEFER: (parse-regexp)
-: nested-parse-regexp ( token ? -- )
-    [ push-stack (parse-regexp) pop-stack ] dip
-    [ <negation> ] when pop-stack new swap >>term push-stack ;
-
-! non-capturing groups
-: (parse-special-group) ( -- )
-    read1 {
-        { [ dup CHAR: # = ] ! comment
-            [ drop comment-group f nested-parse-regexp pop-stack drop ] }
-        { [ dup CHAR: : = ]
-            [ drop non-capture-group f nested-parse-regexp ] }
-        { [ dup CHAR: = = ]
-            [ drop lookahead f nested-parse-regexp ] }
-        { [ dup CHAR: ! = ]
-            [ drop lookahead t nested-parse-regexp ] }
-        { [ dup CHAR: > = ]
-            [ drop non-capture-group f nested-parse-regexp ] }
-        { [ dup CHAR: < = peek1 CHAR: = = and ]
-            [ drop drop1 lookbehind f nested-parse-regexp ] }
-        { [ dup CHAR: < = peek1 CHAR: ! = and ]
-            [ drop drop1 lookbehind t nested-parse-regexp ] }
-        [
-            ":)" read-until
-            [ swap prefix ] dip
-            {
-                { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] }
-                { CHAR: ) [ parse-options ] }
-                [ drop bad-special-group ]
-            } case
-        ]
-    } cond ;
-
-: handle-left-parenthesis ( -- )
-    peek1 CHAR: ? =
-    [ drop1 (parse-special-group) ]
-    [ capture-group f nested-parse-regexp ] if ;
-
-: handle-dot ( -- ) any-char push-stack ;
-: handle-pipe ( -- ) pipe push-stack ;
-: (handle-star) ( obj -- kleene-star )
-    peek1 {
-        { CHAR: + [ drop1 <possessive-kleene-star> ] }
-        { CHAR: ? [ drop1 <reluctant-kleene-star> ] }
-        [ drop <kleene-star> ]
-    } case ;
-: handle-star ( -- ) stack pop (handle-star) push-stack ;
-: handle-question ( -- )
-    stack pop peek1 {
-        { CHAR: + [ drop1 <possessive-question> ] }
-        { CHAR: ? [ drop1 <reluctant-question> ] }
-        [ drop epsilon 2array <alternation> ]
-    } case push-stack ;
-: handle-plus ( -- )
-    stack pop dup (handle-star)
-    2array <concatenation> push-stack ;
-
-ERROR: unmatched-brace ;
-: parse-repetition ( -- start finish ? )
-    "}" read-until [ unmatched-brace ] unless
-    [ "," split1 [ string>number ] bi@ ]
-    [ CHAR: , swap index >boolean ] bi ;
-
-: replicate/concatenate ( n obj -- obj' )
-    over zero? [ 2drop epsilon ]
-    [ <repetition> first|concatenation ] if ;
-
-: exactly-n ( n -- )
-    stack pop replicate/concatenate push-stack ;
-
-: at-least-n ( n -- )
-    stack pop
-    [ replicate/concatenate ] keep
-    <kleene-star> 2array <concatenation> push-stack ;
-
-: at-most-n ( n -- )
-    1+
-    stack pop
-    [ replicate/concatenate ] curry map <alternation> push-stack ;
-
-: from-m-to-n ( m n -- )
-    [a,b]
-    stack pop
-    [ replicate/concatenate ] curry map
-    <alternation> push-stack ;
-
-ERROR: invalid-range a b ;
-
-: handle-left-brace ( -- )
-    parse-repetition
-    [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
-    [
-        2dup and [ from-m-to-n ]
-        [ [ nip at-most-n ] [ at-least-n ] if* ] if
-    ] [ drop 0 max exactly-n ] if ;
-
-: handle-front-anchor ( -- ) beginning-of-line push-stack ;
-: handle-back-anchor ( -- ) end-of-line push-stack ;
-
-ERROR: bad-character-class obj ;
-ERROR: expected-posix-class ;
-
-: parse-posix-class ( -- obj )
-    read1 CHAR: { = [ expected-posix-class ] unless
-    "}" read-until [ bad-character-class ] unless
-    {
-        { "Lower" [ letter-class ] }
-        { "Upper" [ LETTER-class ] }
-        { "Alpha" [ Letter-class ] }
-        { "ASCII" [ ascii-class ] }
-        { "Digit" [ digit-class ] }
-        { "Alnum" [ alpha-class ] }
-        { "Punct" [ punctuation-class ] }
-        { "Graph" [ java-printable-class ] }
-        { "Print" [ java-printable-class ] }
-        { "Blank" [ non-newline-blank-class ] }
-        { "Cntrl" [ control-character-class ] }
-        { "XDigit" [ hex-digit-class ] }
-        { "Space" [ java-blank-class ] }
-        ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
-        [ bad-character-class ]
-    } case ;
+: parse-options ( on off -- options )
+    [ [ ch>option ] { } map-as ] bi@ <options> ;
  
-: parse-octal ( -- n ) 3 read oct> check-octal ;
-: parse-short-hex ( -- n ) 2 read hex> check-hex ;
-: parse-long-hex ( -- n ) 6 read hex> check-hex ;
-: parse-control-character ( -- n ) read1 ;
+: string>options ( string -- options )
+    "-" split1 parse-options ;
+ 
+: options>string ( options -- string )
+    [ on>> ] [ off>> ] bi
+    [ [ option>ch ] map ] bi@
+    [ "-" glue ] unless-empty
+    "" like ;
  
-ERROR: bad-escaped-literals seq ;
+! TODO: add syntax for various parenthized things,
+!       add greedy and nongreedy forms of matching
+! (once it's all implemented)
  
-: parse-til-E ( -- obj )
-    "\\E" read-until [ bad-escaped-literals ] unless ;
-    
-:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
-    parse-til-E
-    drop1
-    [ epsilon ] [
-        quot call [ <constant> ] V{ } map-as
-        first|concatenation
-    ] if-empty ; inline
+EBNF: parse-regexp
  
-: parse-escaped-literals ( -- obj )
-    [ ] (parse-escaped-literals) ;
+CharacterInBracket = !("}") Character
  
-: lower-case-literals ( -- obj )
-    [ >lower ] (parse-escaped-literals) ;
+QuotedCharacter = !("\\E") .
  
-: upper-case-literals ( -- obj )
-    [ >upper ] (parse-escaped-literals) ;
+Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
+       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
+       | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
+       | "u" Character:a Character:b Character:c Character:d
+            => [[ { a b c d } hex> ensure-number ]]
+       | "x" Character:a Character:b
+            => [[ { a b } hex> ensure-number ]]
+       | "0" Character:a Character:b Character:c
+            => [[ { a b c } oct> ensure-number ]]
+       | . => [[ lookup-escape ]]
  
-: parse-escaped ( -- obj )
-    read1
-    {
-        { CHAR: t [ CHAR: \t <constant> ] }
-        { CHAR: n [ CHAR: \n <constant> ] }
-        { CHAR: r [ CHAR: \r <constant> ] }
-        { CHAR: f [ HEX: c <constant> ] }
-        { CHAR: a [ HEX: 7 <constant> ] }
-        { CHAR: e [ HEX: 1b <constant> ] }
-
-        { CHAR: w [ c-identifier-class ] }
-        { CHAR: W [ c-identifier-class <negation> ] }
-        { CHAR: s [ java-blank-class ] }
-        { CHAR: S [ java-blank-class <negation> ] }
-        { CHAR: d [ digit-class ] }
-        { CHAR: D [ digit-class <negation> ] }
-
-        { CHAR: p [ parse-posix-class ] }
-        { CHAR: P [ parse-posix-class <negation> ] }
-        { CHAR: x [ parse-short-hex <constant> ] }
-        { CHAR: u [ parse-long-hex <constant> ] }
-        { CHAR: 0 [ parse-octal <constant> ] }
-        { CHAR: c [ parse-control-character ] }
-
-        { CHAR: Q [ parse-escaped-literals ] }
-
-        ! { CHAR: b [ word-boundary-class ] }
-        ! { CHAR: B [ word-boundary-class <negation> ] }
-        ! { CHAR: A [ handle-beginning-of-input ] }
-        ! { CHAR: z [ handle-end-of-input ] }
-
-        ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
-
-        ! m//g mode
-        ! { CHAR: G [ end of previous match ] }
-
-        ! Group capture
-        ! { CHAR: 1 [ CHAR: 1 <constant> ] }
-        ! { CHAR: 2 [ CHAR: 2 <constant> ] }
-        ! { CHAR: 3 [ CHAR: 3 <constant> ] }
-        ! { CHAR: 4 [ CHAR: 4 <constant> ] }
-        ! { CHAR: 5 [ CHAR: 5 <constant> ] }
-        ! { CHAR: 6 [ CHAR: 6 <constant> ] }
-        ! { CHAR: 7 [ CHAR: 7 <constant> ] }
-        ! { CHAR: 8 [ CHAR: 8 <constant> ] }
-        ! { CHAR: 9 [ CHAR: 9 <constant> ] }
-
-        ! Perl extensions
-        ! can't do \l and \u because \u is already a 4-hex
-        { CHAR: L [ lower-case-literals ] }
-        { CHAR: U [ upper-case-literals ] }
-
-        [ <constant> ]
-    } case ;
+EscapeSequence = "\\" Escape:e => [[ e ]]
  
-: handle-escape ( -- ) parse-escaped push-stack ;
-
-: handle-dash ( vector -- vector' )
-    H{ { dash CHAR: - } } substitute ;
-
-: character-class>alternation ( seq -- alternation )
-    [ dup number? [ <constant> ] when ] map first|alternation ;
-
-: handle-caret ( vector -- vector' )
-    dup [ length 2 >= ] [ first caret eq? ] bi and [
-        rest-slice character-class>alternation <negation>
-    ] [
-        character-class>alternation
-    ] if ;
-
-: make-character-class ( -- character-class )
-    [ beginning-of-character-class swap cut-stack ] change-whole-stack
-    handle-dash handle-caret ;
-
-: apply-dash ( -- )
-    stack [ pop3 nip <character-class-range> ] keep push ;
-
-: apply-dash? ( -- ? )
-    stack dup length 3 >=
-    [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
-
-ERROR: empty-negated-character-class ;
-DEFER: handle-left-bracket
-: (parse-character-class) ( -- )
-    read1 [ empty-negated-character-class ] unless* {
-        { CHAR: [ [ handle-left-bracket t ] }
-        { CHAR: ] [ make-character-class push-stack f ] }
-        { CHAR: - [ dash push-stack t ] }
-        { CHAR: \ [ parse-escaped push-stack t ] }
-        [ push-stack apply-dash? [ apply-dash ] when t ]
-    } case
-    [ (parse-character-class) ] when ;
-
-: push-constant ( ch -- ) <constant> push-stack ;
-
-: parse-character-class-second ( -- )
-    read1 {
-        { CHAR: [ [ CHAR: [ push-constant ] }
-        { CHAR: ] [ CHAR: ] push-constant ] }
-        { CHAR: - [ CHAR: - push-constant ] }
-        [ push1 ]
-    } case ;
+Character = EscapeSequence | . ?[ allowed-char? ]?
  
-: parse-character-class-first ( -- )
-    read1 {
-        { CHAR: ^ [ caret push-stack parse-character-class-second ] }
-        { CHAR: [ [ CHAR: [ push-constant ] }
-        { CHAR: ] [ CHAR: ] push-constant ] }
-        { CHAR: - [ CHAR: - push-constant ] }
-        [ push1 ]
-    } case ;
+AnyRangeCharacter = EscapeSequence | .
  
-: handle-left-bracket ( -- )
-    beginning-of-character-class push-stack
-    parse-character-class-first (parse-character-class) ;
+RangeCharacter = !("]") AnyRangeCharacter
  
-: finish-regexp-parse ( stack -- obj )
-    { pipe } split
-    [ first|concatenation ] map first|alternation ;
+Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+      | RangeCharacter
  
-: handle-right-parenthesis ( -- )
-    stack dup [ parentheses-group "members" word-prop member? ] find-last
-    -rot cut rest
-    [ [ push ] keep current-regexp get (>>stack) ]
-    [ finish-regexp-parse push-stack ] bi* ;
+StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+           | AnyRangeCharacter
  
-: parse-regexp-token ( token -- ? )
-    {
-        { CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning?
-        { CHAR: ) [ handle-right-parenthesis f ] }
-        { CHAR: . [ handle-dot t ] }
-        { CHAR: | [ handle-pipe t ] }
-        { CHAR: ? [ handle-question t ] }
-        { CHAR: * [ handle-star t ] }
-        { CHAR: + [ handle-plus t ] }
-        { CHAR: { [ handle-left-brace t ] }
-        { CHAR: [ [ handle-left-bracket t ] }
-        { CHAR: \ [ handle-escape t ] }
-        [
-            dup CHAR: $ = peek1 f = and
-            [ drop handle-back-anchor f ]
-            [ push-constant t ] if
-        ]
-    } case ;
+Ranges = StartRange:s Range*:r => [[ r s prefix ]]
+
+CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
+
+Options = [idmsux]*
+
+Parenthized = "?:" Alternation:a => [[ a ]]
+            | "?" Options:on "-"? Options:off ":" Alternation:a
+                => [[ a on off parse-options <with-options> ]]
+            | "?#" [^)]* => [[ f ]]
+            | "?~" Alternation:a => [[ a <negation> ]]
+            | "?=" Alternation:a => [[ a <lookahead> ]]
+            | "?!" Alternation:a => [[ a <negation> <lookahead> ]]
+            | "?<=" Alternation:a => [[ a <lookbehind> ]]
+            | "?<!" Alternation:a => [[ a <negation> <lookbehind> ]]
+            | Alternation
+
+Element = "(" Parenthized:p ")" => [[ p ]]
+        | "[" CharClass:r "]" => [[ r ]]
+        | ".":d => [[ any-char <primitive-class> ]]
+        | Character
+
+Number = (!(","|"}").)* => [[ string>number ensure-number ]]
+
+Times = "," Number:n "}" => [[ 0 n <from-to> ]]
+      | Number:n ",}" => [[ n <at-least> ]]
+      | Number:n "}" => [[ n n <from-to> ]]
+      | "}" => [[ bad-number ]]
+      | Number:n "," Number:m "}" => [[ n m <from-to> ]]
+
+Repeated = Element:e "{" Times:t => [[ e t <times> ]]
+         | Element:e "?" => [[ e <maybe> ]]
+         | Element:e "*" => [[ e <star> ]]
+         | Element:e "+" => [[ e <plus> ]]
+         | Element
+
+Concatenation = Repeated*:r => [[ r sift <concatenation> ]]
+
+Alternation = Concatenation:c ("|" Concatenation)*:a
+                => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]
+
+End = !(.)
  
-: (parse-regexp) ( -- )
-    read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
-
-: parse-regexp-beginning ( -- )
-    peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ;
-
-: parse-regexp ( regexp -- )
-    dup current-regexp [
-        raw>> [
-            <string-reader> [
-                parse-regexp-beginning (parse-regexp)
-            ] with-input-stream
-        ] unless-empty
-        current-regexp get [ finish-regexp-parse ] change-stack
-        dup stack>> >>parse-tree drop
-    ] with-variable ;
+Main = Alternation End
+;EBNF
diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor

index 378ae503ce7257ce331f1b412a1b05121b2c6d1f..1dc2a22d8176ad3abb7143b1f200e64c77e41d26 100644 (file)
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@@ -1,6 +1,6 @@
  ! Copyright (C) 2008 Doug Coleman.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel strings help.markup help.syntax regexp.backend ;
+USING: kernel strings help.markup help.syntax ;
  IN: regexp
  
  HELP: <regexp>
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor

index 1cd9a2392efc87e1646eb52b17ec24fda88b67e1..0d9ed129c8546b6427bd8b875b8e392ce3ab25ba 100644 (file)
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -44,9 +44,9 @@ IN: regexp-tests
  ! Dotall mode -- when on, . matches newlines.
  ! Off by default.
  [ f ] [ "\n" "." <regexp> matches? ] unit-test
-[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
+! [ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
  [ t ] [ "\n" R/ ./s matches? ] unit-test
-[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
+! [ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
  
  [ f ] [ "" ".+" <regexp> matches? ] unit-test
  [ t ] [ "a" ".+" <regexp> matches? ] unit-test
@@ -76,8 +76,6 @@ IN: regexp-tests
  [ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test
  [ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test
  
-/*
-! FIXME
  [ f ] [ "" "(a)" <regexp> matches? ] unit-test
  [ t ] [ "a" "(a)" <regexp> matches? ] unit-test
  [ f ] [ "aa" "(a)" <regexp> matches? ] unit-test
@@ -85,7 +83,6 @@ IN: regexp-tests
  
  [ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test
  [ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test
-*/
  
  [ f ] [ "" "a{1}" <regexp> matches? ] unit-test
  [ t ] [ "a" "a{1}" <regexp> matches? ] unit-test
@@ -168,12 +165,9 @@ IN: regexp-tests
  [ f ] [ "0" "[^\\d]" <regexp> matches? ] unit-test
  [ t ] [ "a" "[^\\d]" <regexp> matches? ] unit-test
  
-/*
-! FIXME
  [ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" <regexp> matches? ] unit-test
  [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" <regexp> matches? ] unit-test
  [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" <regexp> matches? ] unit-test
-*/
  
  [ t ] [ "1000" "\\d{4,6}" <regexp> matches? ] unit-test
  [ t ] [ "1000" "[0-9]{4,6}" <regexp> matches? ] unit-test
@@ -195,8 +189,8 @@ IN: regexp-tests
  [ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
  [ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
  [ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
-[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
-[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
+[ t ] [ "x" "\\u0078" <regexp> matches? ] unit-test
+[ f ] [ "y" "\\u0078" <regexp> matches? ] unit-test
  
  [ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
  [ f ] [ "b" "a+b" <regexp> matches? ] unit-test
@@ -226,6 +220,7 @@ IN: regexp-tests
  [ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
  [ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
  
+/*
  [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
  [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
  [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
@@ -235,6 +230,7 @@ IN: regexp-tests
  [ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
  [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
  [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
+*/
  
  [ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
  [ t ] [ "A" R/ [a-z]/i matches? ] unit-test
@@ -253,8 +249,6 @@ IN: regexp-tests
  [ t ] [ "abc*" "[^\\*]*\\*" <regexp> matches? ] unit-test
  [ t ] [ "bca" "[^a]*a" <regexp> matches? ] unit-test
  
-/*
-! FIXME
  [ ] [
      "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
      <regexp> drop
@@ -278,7 +272,6 @@ IN: regexp-tests
  [ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
  
  [ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> first-match >string ] unit-test
-*/
  
  ! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
  
@@ -309,7 +302,6 @@ IN: regexp-tests
  [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
  
  /*
-! FIXME
  [ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
  [ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
  [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
@@ -325,15 +317,21 @@ IN: regexp-tests
  ! Bug in parsing word
  [ t ] [ "a" R' a' matches? ] unit-test
  
-! Convert to lowercase until E
-[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
-[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
+! Testing negation
+[ f ] [ "a" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a)/ matches? ] unit-test
  
-! Convert to uppercase until E
-[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
-[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
+[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
  
-! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
+! Intersecting classes
+[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
  
  ! [ t ] [ "a" R/ ^a/ matches? ] unit-test
  ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
@@ -378,10 +376,10 @@ IN: regexp-tests
  ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
  ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
  
-! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\n" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
  
  ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
  ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor

index 86f978373b54fe31f42b08e4c0cb8f690e988bfa..189d430d85950a3adbee180733ce5de3996b5a4d 100644 (file)
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@@ -2,40 +2,31 @@
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors combinators kernel math sequences strings sets
  assocs prettyprint.backend prettyprint.custom make lexer
-namespaces parser arrays fry regexp.backend regexp.utils
+namespaces parser arrays fry locals regexp.minimize
  regexp.parser regexp.nfa regexp.dfa regexp.traversal
-regexp.transition-tables splitting sorting ;
+regexp.transition-tables splitting sorting regexp.ast
+regexp.negation ;
  IN: regexp
  
-: default-regexp ( string -- regexp )
-    regexp new
-        swap >>raw
-        <transition-table> >>nfa-table
-        <transition-table> >>dfa-table
-        <transition-table> >>minimized-table
-        H{ } clone >>nfa-traversal-flags
-        H{ } clone >>dfa-traversal-flags
-        H{ } clone >>options
-        H{ } clone >>matchers
-        reset-regexp ;
-
-: construct-regexp ( regexp -- regexp' )
-    {
-        [ parse-regexp ]
-        [ construct-nfa ]
-        [ construct-dfa ]
-        [ ]
-    } cleave ;
+TUPLE: regexp raw parse-tree options dfa ;
+
+: <optioned-regexp> ( string options -- regexp )
+    [ dup parse-regexp ] [ string>options ] bi*
+    2dup <with-options> ast>dfa
+    regexp boa ;
+
+: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
+
+<PRIVATE
  
  : (match) ( string regexp -- dfa-traverser )
-    <dfa-traverser> do-match ; inline
+    dfa>> <dfa-traverser> do-match ; inline
+
+PRIVATE>
  
  : match ( string regexp -- slice/f )
      (match) return-match ;
  
-: match* ( string regexp -- slice/f captured-groups )
-    (match) [ return-match ] [ captured-groups>> ] bi ;
-
  : matches? ( string regexp -- ? )
      dupd match
      [ [ length ] bi@ = ] [ drop f ] if* ;
@@ -61,9 +52,13 @@ IN: regexp
      dupd first-match
      [ split1-slice swap ] [ "" like f swap ] if* ;
  
+<PRIVATE
+
  : (re-split) ( string regexp -- )
      over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
  
+PRIVATE>
+
  : re-split ( string regexp -- seq )
      [ (re-split) ] { } make ;
  
@@ -97,22 +92,6 @@ IN: regexp
          { "R| "  "|"  }
      } swap [ subseq? not nip ] curry assoc-find drop ;
  
-: string>options ( string -- options )
-    [ ch>option dup ] H{ } map>assoc ;
-
-: options>string ( options -- string )
-    keys [ option>ch ] map natural-sort >string ;
-
-PRIVATE>
-
-: <optioned-regexp> ( string option-string -- regexp )
-    [ default-regexp ] [ string>options ] bi* >>options
-    construct-regexp ;
-
-: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
-
-<PRIVATE
-
  : parsing-regexp ( accum end -- accum )
      lexer get dup skip-blank
      [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor

index e5c31a54e0e40f4260e439030410069e36b99bc2..c02ebce91f03c728340f96b32c6b8c2bdc6c5bab 100644 (file)
--- a/basis/regexp/transition-tables/transition-tables.factor
+++ b/basis/regexp/transition-tables/transition-tables.factor
@@ -1,7 +1,7 @@
  ! Copyright (C) 2008 Doug Coleman.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors arrays assocs fry hashtables kernel sequences
-vectors regexp.utils ;
+vectors ;
  IN: regexp.transition-tables
  
  TUPLE: transition from to obj ;
@@ -41,8 +41,8 @@ TUPLE: transition-table transitions start-state final-states ;
      #! set the state as a key
      2dup [ to>> ] dip maybe-initialize-key
      [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
-    2dup at* [ 2nip insert-at ]
-    [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ;
+    2dup at* [ 2nip push-at ]
+    [ drop [ H{ } clone [ push-at ] keep ] 2dip set-at ] if ;
  
  : add-transition ( transition transition-table -- )
      transitions>> set-transition ;
diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor

index 104a6c2ce1c2159445e2ba8175d55520e5e295b1..5d48353f56f2b2142c73ae452ac2904ba210925b 100644 (file)
--- a/basis/regexp/traversal/traversal.factor
+++ b/basis/regexp/traversal/traversal.factor
@@ -1,40 +1,25 @@
  ! Copyright (C) 2008 Doug Coleman.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors assocs combinators kernel math
-quotations sequences regexp.parser regexp.classes fry arrays
-combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
+quotations sequences regexp.classes fry arrays
+combinators.short-circuit prettyprint regexp.nfa ;
  IN: regexp.traversal
  
  TUPLE: dfa-traverser
      dfa-table
-    traversal-flags
-    traverse-forward
-    lookahead-counters
-    lookbehind-counters
-    capture-counters
-    captured-groups
-    capture-group-index
-    last-state current-state
+    current-state
      text
      match-failed?
      start-index current-index
      matches ;
  
-: <dfa-traverser> ( text regexp -- match )
-    [ dfa-table>> ] [ dfa-traversal-flags>> ] bi
+: <dfa-traverser> ( text dfa -- match )
      dfa-traverser new
-        swap >>traversal-flags
          swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
          swap >>text
-        t >>traverse-forward
          0 >>start-index
          0 >>current-index
-        0 >>capture-group-index
-        V{ } clone >>matches
-        V{ } clone >>capture-counters
-        V{ } clone >>lookbehind-counters
-        V{ } clone >>lookahead-counters
-        H{ } clone >>captured-groups ;
+        V{ } clone >>matches ;
  
  : final-state? ( dfa-traverser -- ? )
      [ current-state>> ]
@@ -48,7 +33,7 @@ TUPLE: dfa-traverser
  
  : text-finished? ( dfa-traverser -- ? )
      {
-        [ current-state>> empty? ]
+        [ current-state>> not ]
          [ end-of-text? ]
          [ match-failed?>> ]
      } 1|| ;
@@ -61,111 +46,27 @@ TUPLE: dfa-traverser
          dup save-final-state
      ] when text-finished? ;
  
+: text-character ( dfa-traverser n -- ch )
+    [ text>> ] swap '[ current-index>> _ + ] bi nth ;
+
  : previous-text-character ( dfa-traverser -- ch )
-    [ text>> ] [ current-index>> 1- ] bi nth ;
+    -1 text-character ;
  
  : current-text-character ( dfa-traverser -- ch )
-    [ text>> ] [ current-index>> ] bi nth ;
+    0 text-character ;
  
  : next-text-character ( dfa-traverser -- ch )
-    [ text>> ] [ current-index>> 1+ ] bi nth ;
-
-GENERIC: flag-action ( dfa-traverser flag -- )
-
-
-M: beginning-of-input flag-action ( dfa-traverser flag -- )
-    drop
-    dup beginning-of-text? [ t >>match-failed? ] unless drop ;
-
-M: end-of-input flag-action ( dfa-traverser flag -- )
-    drop
-    dup end-of-text? [ t >>match-failed? ] unless drop ;
-
-
-M: beginning-of-line flag-action ( dfa-traverser flag -- )
-    drop
-    dup {
-        [ beginning-of-text? ]
-        [ previous-text-character terminator-class class-member? ]
-    } 1|| [ t >>match-failed? ] unless drop ;
-
-M: end-of-line flag-action ( dfa-traverser flag -- )
-    drop
-    dup {
-        [ end-of-text? ]
-        [ next-text-character terminator-class class-member? ]
-    } 1|| [ t >>match-failed? ] unless drop ;
-
-
-M: word-boundary flag-action ( dfa-traverser flag -- )
-    drop
-    dup {
-        [ end-of-text? ]
-        [ current-text-character terminator-class class-member? ]
-    } 1|| [ t >>match-failed? ] unless drop ;
-
-
-M: lookahead-on flag-action ( dfa-traverser flag -- )
-    drop
-    lookahead-counters>> 0 swap push ;
-
-M: lookahead-off flag-action ( dfa-traverser flag -- )
-    drop
-    dup lookahead-counters>>
-    [ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ;
-
-M: lookbehind-on flag-action ( dfa-traverser flag -- )
-    drop
-    f >>traverse-forward
-    [ 2 - ] change-current-index
-    lookbehind-counters>> 0 swap push ;
-
-M: lookbehind-off flag-action ( dfa-traverser flag -- )
-    drop
-    t >>traverse-forward
-    dup lookbehind-counters>>
-    [ drop ] [ pop '[ _ + 2 + ] change-current-index drop ] if-empty ;
-
-M: capture-group-on flag-action ( dfa-traverser flag -- )
-    drop
-    [ current-index>> 0 2array ]
-    [ capture-counters>> ] bi push ;
-
-M: capture-group-off flag-action ( dfa-traverser flag -- )
-    drop
-    dup capture-counters>> empty? [
-        drop
-    ] [
-        {
-            [ capture-counters>> pop first2 dupd + ]
-            [ text>> <slice> ]
-            [ [ 1+ ] change-capture-group-index capture-group-index>> ]
-            [ captured-groups>> set-at ]
-        } cleave
-    ] if ;
-
-: process-flags ( dfa-traverser -- )
-    [ [ 1+ ] map ] change-lookahead-counters
-    [ [ 1+ ] map ] change-lookbehind-counters
-    [ [ first2 1+ 2array ] map ] change-capture-counters
-    ! dup current-state>> .
-    dup [ current-state>> ] [ traversal-flags>> ] bi
-    at [ flag-action ] with each ;
+    1 text-character ;
  
  : increment-state ( dfa-traverser state -- dfa-traverser )
-    [
-        dup traverse-forward>>
-        [ [ 1+ ] change-current-index ]
-        [ [ 1- ] change-current-index ] if
-        dup current-state>> >>last-state
-    ] [ first ] bi* >>current-state ;
+    [ [ 1 + ] change-current-index ] dip >>current-state ;
  
  : match-literal ( transition from-state table -- to-state/f )
      transitions>> at at ;
  
  : match-class ( transition from-state table -- to-state/f )
      transitions>> at* [
-        [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if
+        swap '[ drop _ swap class-member? ] assoc-find spin ?
      ] [ drop ] if ;
  
  : match-default ( transition from-state table -- to-state/f )
@@ -180,7 +81,6 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
      [ dfa-table>> ] tri ;
  
  : do-match ( dfa-traverser -- dfa-traverser )
-    dup process-flags
      dup match-done? [
          dup setup-match match-transition
          [ increment-state do-match ] when*
diff --git a/basis/regexp/utils/utils-tests.factor b/basis/regexp/utils/utils-tests.factor

deleted file mode 100644 (file)

index d048ad4..0000000
--- a/basis/regexp/utils/utils-tests.factor
+++ /dev/null
@@ -1,4 +0,0 @@
-USING: regexp.utils tools.test ;
-IN: regexp.utils.tests
-
-[ [ ] [ ] while-changes ] must-infer
diff --git a/basis/regexp/utils/utils.factor b/basis/regexp/utils/utils.factor

deleted file mode 100644 (file)

index af1b2fa..0000000
--- a/basis/regexp/utils/utils.factor
+++ /dev/null
@@ -1,64 +0,0 @@
-! Copyright (C) 2008 Doug Coleman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs io kernel math math.order
-namespaces regexp.backend sequences unicode.categories
-math.ranges fry combinators.short-circuit vectors ;
-IN: regexp.utils
-
-: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
-    [ [ dup slip ] dip pick over call ] dip dupd =
-    [ 3drop ] [ (while-changes) ] if ; inline recursive
-
-: while-changes ( obj quot pred -- obj' )
-    pick over call (while-changes) ; inline
-
-: assoc-with ( param assoc quot -- assoc curry )
-    swapd [ [ -rot ] dip call ] 2curry ; inline
-
-: insert-at ( value key hash -- )
-    2dup at* [
-        2nip push
-    ] [
-        drop
-        [ dup vector? [ 1vector ] unless ] 2dip set-at
-    ] if ;
-
-: ?insert-at ( value key hash/f -- hash )
-    [ H{ } clone ] unless* [ insert-at ] keep ;
-
-ERROR: bad-octal number ;
-ERROR: bad-hex number ;
-: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
-: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
-
-: ascii? ( n -- ? ) 0 HEX: 7f between? ;
-: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
-: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
-
-: hex-digit? ( n -- ? )
-    [
-        [ decimal-digit? ]
-        [ CHAR: a CHAR: f between? ]
-        [ CHAR: A CHAR: F between? ]
-    ] 1|| ;
-
-: control-char? ( n -- ? )
-    [
-        [ 0 HEX: 1f between? ]
-        [ HEX: 7f = ]
-    ] 1|| ;
-
-: punct? ( n -- ? )
-    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
-
-: c-identifier-char? ( ch -- ? )
-    [ [ alpha? ] [ CHAR: _ = ] ] 1|| ;
-
-: java-blank? ( n -- ? )
-    {
-        CHAR: \s CHAR: \t CHAR: \n
-        HEX: b HEX: 7 CHAR: \r
-    } member? ;
-
-: java-printable? ( n -- ? )
-    [ [ alpha? ] [ punct? ] ] 1|| ;
diff --git a/basis/xmode/catalog/catalog.factor b/basis/xmode/catalog/catalog.factor

index 7a935d31a424b6619f9219a5302c30fe29784d92..894ec264abb4ed02eed51f130aea3fcdc5686194 100644 (file)
--- a/basis/xmode/catalog/catalog.factor
+++ b/basis/xmode/catalog/catalog.factor
@@ -1,13 +1,14 @@
  USING: xmode.loader xmode.utilities xmode.rules namespaces
  strings splitting assocs sequences kernel io.files xml memoize
-words globs combinators io.encodings.utf8 sorting accessors xml.data ;
+words globs combinators io.encodings.utf8 sorting accessors xml.data
+xml.traversal xml.syntax ;
  IN: xmode.catalog
  
  TUPLE: mode file file-name-glob first-line-glob ;
  
-<TAGS: parse-mode-tag ( modes tag -- )
+TAGS: parse-mode-tag ( modes tag -- )
  
-TAG: MODE
+TAG: MODE parse-mode-tag
      dup "NAME" attr [
          mode new {
              { "FILE" f (>>file) }
@@ -17,11 +18,9 @@ TAG: MODE
      ] dip
      rot set-at ;
  
-TAGS>
-
  : parse-modes-tag ( tag -- modes )
      H{ } clone [
-        swap child-tags [ parse-mode-tag ] with each
+        swap children-tags [ parse-mode-tag ] with each
      ] keep ;
  
  MEMO: modes ( -- modes )
@@ -97,8 +96,8 @@ ERROR: mutually-recursive-rulesets ruleset ;
      ] if ;
  
  : finalize-mode ( rulesets -- )
-    rule-sets [
-        dup [ nip finalize-rule-set ] assoc-each
+    dup rule-sets [
+        [ nip finalize-rule-set ] assoc-each
      ] with-variable ;
  
  : load-mode ( name -- rule-sets )
diff --git a/basis/xmode/loader/loader.factor b/basis/xmode/loader/loader.factor

index ef1defc4da55f7ce27962fe6833152a8d46878f3..e5d5112a275b45c406d5c4261612b686c887f187 100644 (file)
--- a/basis/xmode/loader/loader.factor
+++ b/basis/xmode/loader/loader.factor
@@ -1,56 +1,54 @@
  USING: xmode.loader.syntax xmode.tokens xmode.rules
  xmode.keyword-map xml.data xml.traversal xml assocs kernel
  combinators sequences math.parser namespaces parser
-xmode.utilities parser-combinators.regexp io.files accessors ;
+xmode.utilities regexp io.files accessors xml.syntax ;
  IN: xmode.loader
  
  ! Based on org.gjt.sp.jedit.XModeHandler
  
  ! RULES and its children
-<TAGS: parse-rule-tag ( rule-set tag -- )
+TAGS: parse-rule-tag ( rule-set tag -- )
  
-TAG: PROPS
+TAG: PROPS parse-rule-tag
      parse-props-tag >>props drop ;
  
-TAG: IMPORT
+TAG: IMPORT parse-rule-tag
      "DELEGATE" attr swap import-rule-set ;
  
-TAG: TERMINATE
+TAG: TERMINATE parse-rule-tag
      "AT_CHAR" attr string>number >>terminate-char drop ;
  
-RULE: SEQ seq-rule
+RULE: SEQ seq-rule parse-rule-tag
      shared-tag-attrs delegate-attr literal-start ;
  
-RULE: SEQ_REGEXP seq-rule
+RULE: SEQ_REGEXP seq-rule parse-rule-tag
      shared-tag-attrs delegate-attr regexp-attr regexp-start ;
  
-RULE: SPAN span-rule
+RULE: SPAN span-rule parse-rule-tag
      shared-tag-attrs delegate-attr match-type-attr span-attrs parse-begin/end-tags init-span-tag ;
  
-RULE: SPAN_REGEXP span-rule
+RULE: SPAN_REGEXP span-rule parse-rule-tag
      shared-tag-attrs delegate-attr match-type-attr span-attrs regexp-attr parse-begin/end-tags init-span-tag ;
  
-RULE: EOL_SPAN eol-span-rule
+RULE: EOL_SPAN eol-span-rule parse-rule-tag
      shared-tag-attrs delegate-attr match-type-attr literal-start init-eol-span-tag ;
  
-RULE: EOL_SPAN_REGEXP eol-span-rule
+RULE: EOL_SPAN_REGEXP eol-span-rule parse-rule-tag
      shared-tag-attrs delegate-attr match-type-attr regexp-attr regexp-start init-eol-span-tag ;
  
-RULE: MARK_FOLLOWING mark-following-rule
+RULE: MARK_FOLLOWING mark-following-rule parse-rule-tag
      shared-tag-attrs match-type-attr literal-start ;
  
-RULE: MARK_PREVIOUS mark-previous-rule
+RULE: MARK_PREVIOUS mark-previous-rule parse-rule-tag
      shared-tag-attrs match-type-attr literal-start ;
  
-TAG: KEYWORDS ( rule-set tag -- key value )
+TAG: KEYWORDS parse-rule-tag
      rule-set get ignore-case?>> <keyword-map>
-    swap child-tags [ over parse-keyword-tag ] each
+    swap children-tags [ over parse-keyword-tag ] each
      swap (>>keywords) ;
  
-TAGS>
-
  : ?<regexp> ( string/f -- regexp/f )
-    dup [ rule-set get ignore-case?>> <regexp> ] when ;
+    dup [ rule-set get ignore-case?>> <?insensitive-regexp> ] when ;
  
  : (parse-rules-tag) ( tag -- rule-set )
      <rule-set> dup rule-set set
@@ -66,7 +64,7 @@ TAGS>
  
  : parse-rules-tag ( tag -- rule-set )
      [
-        [ (parse-rules-tag) ] [ child-tags ] bi
+        [ (parse-rules-tag) ] [ children-tags ] bi
          [ parse-rule-tag ] with each
          rule-set get
      ] with-scope ;
diff --git a/basis/xmode/loader/syntax/syntax.factor b/basis/xmode/loader/syntax/syntax.factor

index 0e7293da976f54d16fe4222a658580a736cbe570..60318e669e7fea9cffb97649a07d07c21a2236d7 100644 (file)
--- a/basis/xmode/loader/syntax/syntax.factor
+++ b/basis/xmode/loader/syntax/syntax.factor
@@ -3,7 +3,7 @@
  USING: accessors xmode.tokens xmode.rules xmode.keyword-map
  xml.data xml.traversal xml assocs kernel combinators sequences
  math.parser namespaces make parser lexer xmode.utilities
-parser-combinators.regexp io.files splitting arrays ;
+regexp io.files splitting arrays xml.syntax xml.syntax.private ;
  IN: xmode.loader.syntax
  
  ! Rule tag parsing utilities
@@ -11,9 +11,10 @@ IN: xmode.loader.syntax
      new swap init-from-tag swap add-rule ; inline
  
  : RULE:
-    scan scan-word
-    parse-definition { } make
-    swap [ (parse-rule-tag) ] 2curry (TAG:) ; parsing
+    scan scan-word scan-word [
+        parse-definition { } make
+        swap [ (parse-rule-tag) ] 2curry
+    ] dip swap define-tag ; parsing
  
  ! Attribute utilities
  : string>boolean ( string -- ? ) "TRUE" = ;
@@ -32,7 +33,7 @@ IN: xmode.loader.syntax
      [ "NAME" attr ] [ "VALUE" attr ] bi ;
  
  : parse-props-tag ( tag -- assoc )
-    child-tags
+    children-tags
      [ parse-prop-tag ] H{ } map>assoc ;
  
  : position-attrs ( tag -- at-line-start? at-whitespace-end? at-word-start? )
@@ -46,7 +47,8 @@ IN: xmode.loader.syntax
      swap position-attrs <matcher> ;
  
  : parse-regexp-matcher ( tag -- matcher )
-    dup children>string rule-set get ignore-case?>> <regexp>
+    dup children>string
+    rule-set get ignore-case?>> <?insensitive-regexp>
      swap position-attrs <matcher> ;
  
  : shared-tag-attrs ( -- )
@@ -79,22 +81,20 @@ IN: xmode.loader.syntax
      [ parse-literal-matcher >>end drop ] , ;
  
  ! SPAN's children
-<TAGS: parse-begin/end-tag ( rule tag -- )
+TAGS: parse-begin/end-tag ( rule tag -- )
  
-TAG: BEGIN
+TAG: BEGIN parse-begin/end-tag
      ! XXX
      parse-literal-matcher >>start drop ;
  
-TAG: END
+TAG: END parse-begin/end-tag
      ! XXX
      parse-literal-matcher >>end drop ;
  
-TAGS>
-
  : parse-begin/end-tags ( -- )
      [
          ! XXX: handle position attrs on span tag itself
-        child-tags [ parse-begin/end-tag ] with each
+        children-tags [ parse-begin/end-tag ] with each
      ] , ;
  
  : init-span-tag ( -- ) [ drop init-span ] , ;
diff --git a/basis/xmode/marker/marker.factor b/basis/xmode/marker/marker.factor

index cff0af2a981ed41c7cff5f2a8e6d8dc8585c5cb3..e106af79526eb2209de8a46fe9cbeb419df22c04 100755 (executable)
--- a/basis/xmode/marker/marker.factor
+++ b/basis/xmode/marker/marker.factor
@@ -4,8 +4,24 @@ IN: xmode.marker
  USING: kernel namespaces make xmode.rules xmode.tokens
  xmode.marker.state xmode.marker.context xmode.utilities
  xmode.catalog sequences math assocs combinators strings
-parser-combinators.regexp splitting parser-combinators ascii
+regexp splitting ascii regexp.backend unicode.case
  ascii combinators.short-circuit accessors ;
+! regexp.backend is for the regexp class
+
+! Next two words copied from parser-combinators
+! Just like head?, but they optionally ignore case
+
+: string= ( str1 str2 ignore-case -- ? )
+    [ [ >upper ] bi@ ] when sequence= ;
+
+: string-head? ( str1 str2 ignore-case -- ? )
+    2over shorter?
+    [ 3drop f ] [
+        [
+            [ nip ]
+            [ length head-slice ] 2bi
+        ] dip string=
+    ] if ;
  
  ! Based on org.gjt.sp.jedit.syntax.TokenMarker
  
@@ -150,7 +166,7 @@ M: escape-rule handle-rule-start
      process-escape? get [
          escaped? [ not ] change
          position [ + ] change
-    ] [ 2drop ] if ;
+    ] [ drop ] if ;
  
  M: seq-rule handle-rule-start
      ?end-rule
diff --git a/basis/xmode/rules/rules.factor b/basis/xmode/rules/rules.factor

index adc43d7bb6b6364521eb220c564af61dfbcd6436..99364fe7cd605808e74a0815b10149c2870db86e 100644 (file)
--- a/basis/xmode/rules/rules.factor
+++ b/basis/xmode/rules/rules.factor
@@ -1,6 +1,6 @@
  USING: accessors xmode.tokens xmode.keyword-map kernel
  sequences vectors assocs strings memoize unicode.case
-parser-combinators.regexp ;
+regexp regexp.backend ; ! regexp.backend has the regexp class
  IN: xmode.rules
  
  TUPLE: string-matcher string ignore-case? ;
diff --git a/basis/xmode/utilities/utilities-tests.factor b/basis/xmode/utilities/utilities-tests.factor

index 1339430cf8d06a036660b0dafca66e0e7732a407..0ef221f23732e44d69fde9a44ddd316f5c9d1015 100644 (file)
--- a/basis/xmode/utilities/utilities-tests.factor
+++ b/basis/xmode/utilities/utilities-tests.factor
@@ -1,7 +1,6 @@
+USING: assocs xmode.utilities tools.test ;
  IN: xmode.utilities.tests
-USING: accessors xmode.utilities tools.test xml xml.data kernel
-strings vectors sequences io.files prettyprint assocs
-unicode.case ;
+
  [ "hi" 3 ] [
      { 1 2 3 4 5 6 7 8 } [ H{ { 3 "hi" } } at ] map-find
  ] unit-test
@@ -9,44 +8,3 @@ unicode.case ;
  [ f f ] [
      { 1 2 3 4 5 6 7 8 } [ H{ { 11 "hi" } } at ] map-find
  ] unit-test
-
-TUPLE: company employees type ;
-
-: <company> V{ } clone f company boa ;
-
-: add-employee employees>> push ;
-
-<TAGS: parse-employee-tag
-
-TUPLE: employee name description ;
-
-TAG: employee
-    employee new
-    { { "name" f (>>name) } { f (>>description) } }
-    init-from-tag swap add-employee ;
-
-TAGS>
-
-\ parse-employee-tag see
-
-: parse-company-tag
-    [
-        <company>
-        { { "type" >upper (>>type) } }
-        init-from-tag dup
-    ] keep
-    children>> [ tag? ] filter
-    [ parse-employee-tag ] with each ;
-
-[
-    T{ company f
-        V{
-            T{ employee f "Joe" "VP Sales" }
-            T{ employee f "Jane" "CFO" }
-        }
-        "PUBLIC"
-    }
-] [
-    "vocab:xmode/utilities/test.xml"
-    file>xml parse-company-tag
-] unit-test
diff --git a/basis/xmode/utilities/utilities.factor b/basis/xmode/utilities/utilities.factor

index 2423fb0d861cbff37d0e8041a4436157747b8600..1b2b4a352ffede5c44296ae15e3f370b1f3bd4c7 100644 (file)
--- a/basis/xmode/utilities/utilities.factor
+++ b/basis/xmode/utilities/utilities.factor
@@ -1,11 +1,10 @@
  USING: accessors sequences assocs kernel quotations namespaces
-xml.data xml.traversal combinators macros parser lexer words fry ;
+xml.data xml.traversal combinators macros parser lexer words fry
+regexp ;
  IN: xmode.utilities
  
  : implies ( x y -- z ) [ not ] dip or ; inline
  
-: child-tags ( tag -- seq ) children>> [ tag? ] filter ;
-
  : map-find ( seq quot -- result elt )
      [ f ] 2dip
      '[ nip @ dup ] find
@@ -38,20 +37,5 @@ MACRO: (init-from-tag) ( specs -- )
  : init-from-tag ( tag tuple specs -- tuple )
      over [ (init-from-tag) ] dip ; inline
  
-SYMBOL: tag-handlers
-SYMBOL: tag-handler-word
-
-: <TAGS:
-    CREATE tag-handler-word set
-    H{ } clone tag-handlers set ; parsing
-
-: (TAG:) ( name quot -- ) swap tag-handlers get set-at ;
-
-: TAG:
-    scan parse-definition
-    (TAG:) ; parsing
-
-: TAGS>
-    tag-handler-word get
-    tag-handlers get >alist [ [ dup main>> ] dip case ] curry
-    define ; parsing
+: <?insensitive-regexp> ( string ? -- regexp )
+    "i" "" ? <optioned-regexp> ;
diff --git a/core/assocs/assocs.factor b/core/assocs/assocs.factor

index fdaa02e6c42cd4e8815cdc3181271f0de49f63d3..35e7535aa71f33007d651bbc0fbc06feac6fdd27 100755 (executable)
--- a/core/assocs/assocs.factor
+++ b/core/assocs/assocs.factor
@@ -41,7 +41,7 @@ M: assoc assoc-like drop ;
  : substituter ( assoc -- quot )
      [ ?at drop ] curry ; inline
  
-: with-assoc ( assoc quot: ( value key -- assoc ) -- quot: ( key value -- ) )
+: with-assoc ( assoc quot: ( value key assoc -- ) -- quot: ( key value -- ) )
      curry [ swap ] prepose ; inline
  
  PRIVATE>
author	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Mon, 23 Feb 2009 19:10:54 +0000 (13:10 -0600)
committer	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Mon, 23 Feb 2009 19:10:54 +0000 (13:10 -0600)
basis/ascii/ascii.factor		patch \| blob \| history
basis/regexp/ast/ast.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/backend/backend.factor	[deleted file]	patch \| blob \| history
basis/regexp/classes/classes-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/classes/classes.factor		patch \| blob \| history
basis/regexp/dfa/dfa-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/dfa/dfa.factor		patch \| blob \| history
basis/regexp/disambiguate/disambiguate.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/minimize/minimize-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/minimize/minimize.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/negation/negation-tests.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/negation/negation.factor	[new file with mode: 0644]	patch \| blob
basis/regexp/nfa/nfa.factor		patch \| blob \| history
basis/regexp/parser/parser-tests.factor		patch \| blob \| history
basis/regexp/parser/parser.factor		patch \| blob \| history
basis/regexp/regexp-docs.factor		patch \| blob \| history
basis/regexp/regexp-tests.factor		patch \| blob \| history
basis/regexp/regexp.factor		patch \| blob \| history
basis/regexp/transition-tables/transition-tables.factor		patch \| blob \| history
basis/regexp/traversal/traversal.factor		patch \| blob \| history
basis/regexp/utils/utils-tests.factor	[deleted file]	patch \| blob \| history
basis/regexp/utils/utils.factor	[deleted file]	patch \| blob \| history
basis/xmode/catalog/catalog.factor		patch \| blob \| history
basis/xmode/loader/loader.factor		patch \| blob \| history
basis/xmode/loader/syntax/syntax.factor		patch \| blob \| history
basis/xmode/marker/marker.factor		patch \| blob \| history
basis/xmode/rules/rules.factor		patch \| blob \| history
basis/xmode/utilities/utilities-tests.factor		patch \| blob \| history
basis/xmode/utilities/utilities.factor		patch \| blob \| history
core/assocs/assocs.factor		patch \| blob \| history