Reorganizing regexp matcher protocol

author Daniel Ehrenberg <littledan@Macintosh-122.local>

Fri, 27 Feb 2009 04:14:41 +0000 (22:14 -0600)

committer Daniel Ehrenberg <littledan@Macintosh-122.local>

Fri, 27 Feb 2009 04:14:41 +0000 (22:14 -0600)
author Daniel Ehrenberg <littledan@Macintosh-122.local>
Fri, 27 Feb 2009 04:14:41 +0000 (22:14 -0600)
committer Daniel Ehrenberg <littledan@Macintosh-122.local>
Fri, 27 Feb 2009 04:14:41 +0000 (22:14 -0600)
diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor

index fa3e67d1f9e657251767f28d68dba6a3dbbde78e..7fda0103517cd3aee3a98d13db1b39061bf9dbae 100644 (file)
--- a/basis/regexp/compiler/compiler.factor
+++ b/basis/regexp/compiler/compiler.factor
@@ -36,8 +36,7 @@ IN: regexp.compiler
  
  : transitions>quot ( transitions final-state? -- quot )
      [ split-literals suffix ] dip
-    '[ _ _ step ] ;
-    ! '[ { array-capacity string } declare _ _ step ] ;
+    '[ { array-capacity string } declare _ _ step ] ;
  
  : word>quot ( word dfa -- quot )
      [ transitions>> at ]
@@ -68,8 +67,11 @@ IN: regexp.compiler
  : dfa>word ( dfa -- word )
      states>words [ states>code ] keep start-state>> ;
  
-: run-regexp ( string word -- ? )
-    [ f 0 ] 2dip execute ; inline
+: check-string ( string -- string )
+    dup string? [ "String required" throw ] unless ;
+
+: run-regexp ( start-index string word -- ? )
+    { [ f ] [ >fixnum ] [ check-string ] [ execute ] } spread ; inline
  
  : dfa>quotation ( dfa -- quot )
      dfa>word '[ _ run-regexp ] ;
@@ -77,5 +79,5 @@ IN: regexp.compiler
  TUPLE: quot-matcher quot ;
  C: <quot-matcher> quot-matcher
  
-M: quot-matcher match-index
-    quot>> call( string -- i/f ) ;
+M: quot-matcher match-index-from
+    quot>> call( index string -- i/f ) ;
diff --git a/basis/regexp/matchers/matchers.factor b/basis/regexp/matchers/matchers.factor

index 7ac1edf58cb1a2d02f3a4ecdc8b24e9f7278dac9..1c45dade71bb60b9f08bfcb4e2d3c6b05c8bdb88 100644 (file)
--- a/basis/regexp/matchers/matchers.factor
+++ b/basis/regexp/matchers/matchers.factor
@@ -1,61 +1,60 @@
  ! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel sequences math splitting make fry ;
+USING: kernel sequences math splitting make fry locals math.ranges
+accessors arrays ;
  IN: regexp.matchers
  
  ! For now, a matcher is just something with a method to do the
  ! equivalent of match.
  
-! matcher protocol:
-GENERIC: match-index ( string matcher -- index/f )
+GENERIC: match-index-from ( i string matcher -- index/f )
  
-: match ( string matcher -- slice/f )
-    dupd match-index [ head-slice ] [ drop f ] if* ;
+: match-index-head ( string matcher -- index/f )
+    [ 0 ] 2dip match-index-from ;
+
+: match-slice ( i string matcher -- slice/f )
+    [ 2dup ] dip match-index-from
+    [ swap <slice> ] [ 2drop f ] if* ;
  
  : matches? ( string matcher -- ? )
-    dupd match-index
+    dupd match-index-head
      [ swap length = ] [ drop f ] if* ;
  
-: match-head ( string matcher -- end/f ) match [ length ] [ f ] if* ;
+: map-find ( seq quot -- result elt )
+    [ f ] 2dip
+    '[ nip @ dup ] find
+    [ [ drop f ] unless ] dip ; inline
  
-: match-at ( string m matcher -- n/f finished? )
-    [
-        2dup swap length > [ 2drop f f ] [ tail-slice t ] if
-    ] dip swap [ match-head f ] [ 2drop f t ] if ;
+:: match-from ( i string matcher -- slice/f )
+    i string length [a,b)
+    [ string matcher match-slice ] map-find drop ;
  
-: match-range ( string m matcher -- a/f b/f )
-    3dup match-at over [
-        drop nip rot drop dupd +
-    ] [
-        [ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if
-    ] if ;
+: match-head ( str matcher -- slice/f )
+    [ 0 ] 2dip match-from ;
  
-: first-match ( string matcher -- slice/f )
-    dupd 0 swap match-range rot over [ <slice> ] [ 3drop f ] if ;
+: next-match ( i string matcher -- i match/f )
+    match-from [ dup [ to>> ] when ] keep ;
  
-: re-cut ( string matcher -- end/f start )
-    dupd first-match
-    [ split1-slice swap ] [ "" like f swap ] if* ;
+:: all-matches ( string matcher -- seq )
+    0 [ dup ] [ string matcher next-match ] [ ] produce nip but-last ;
+
+: count-matches ( string matcher -- n )
+    all-matches length ;
  
  <PRIVATE
  
-: (re-split) ( string matcher -- )
-    over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
+:: split-slices ( string slices -- new-slices )
+    slices [ to>> ] map 0 prefix
+    slices [ from>> ] map string length suffix
+    [ string <slice> ] 2map ;
  
  PRIVATE>
  
+: re-split1 ( string matcher -- before after/f )
+    dupd match-head [ 1array split-slices first2 ] [ f ] if* ;
+
  : re-split ( string matcher -- seq )
-    [ (re-split) ] { } make ;
+    dupd all-matches split-slices ;
  
  : re-replace ( string matcher replacement -- result )
      [ re-split ] dip join ;
-
-: next-match ( string matcher -- end/f match/f )
-    dupd first-match dup
-    [ [ split1-slice nip ] keep ] [ 2drop f f ] if ;
-
-: all-matches ( string matcher -- seq )
-    [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
-
-: count-matches ( string matcher -- n )
-    all-matches length ;
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor

index cbc582b295ca04b8196e32a81df612f4c4c53988..f4382b50780f7cc0fd017dfd243af2e302bead99 100644 (file)
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -208,8 +208,8 @@ IN: regexp-tests
  [ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test
  [ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test
  
-[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
-[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
+[ 3 ] [ "aaacb" "a*" <regexp> match-index-head ] unit-test
+[ 2 ] [ "aaacb" "aa?" <regexp> match-index-head ] unit-test
  
  [ t ] [ "aaa" R/ AAA/i matches? ] unit-test
  [ f ] [ "aax" R/ AAA/i matches? ] unit-test
@@ -238,11 +238,11 @@ IN: regexp-tests
  [ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
  [ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
  
-[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
-[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
+[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test
+[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test
  
-[ t ] [ "xabc" R/ abc/ <reverse-matcher> match >boolean ] unit-test
-[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match >boolean ] unit-test
+[ t ] [ "xabc" R/ abc/ <reverse-matcher> match-index-head >boolean ] unit-test
+[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-head >boolean ] unit-test
  
  [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
  [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@@ -267,13 +267,13 @@ IN: regexp-tests
  
  [ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test
  
-[ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test
-[ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test
+[ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
+[ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
  
-[ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
-[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
+[ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
+[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
  
-[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> first-match >string ] unit-test
+[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> match-head >string ] unit-test
  
  ! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
  
@@ -304,16 +304,16 @@ IN: regexp-tests
  [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
  
  /*
-[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
-[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
+[ f ] [ "ab" "a(?!b)" <regexp> match-head ] unit-test
+[ "a" ] [ "ac" "a(?!b)" <regexp> match-head >string ] unit-test
  [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
  [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
-[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
-[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
-[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test
+[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> match-head >string ] unit-test
+[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> match-head >string ] unit-test
+[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> match-head >string ] unit-test
  
-[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
-[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
+[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-index-head ] unit-test
+[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-index-head ] unit-test
  */
  
  ! Bug in parsing word
@@ -393,15 +393,15 @@ IN: regexp-tests
  ! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
  ! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
  
-! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
-! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
+! [ f ] [ "foobxr" "foo\\z" <regexp> match-index-head ] unit-test
+! [ 3 ] [ "foo" "foo\\z" <regexp> match-index-head ] unit-test
  
  ! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
  ! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
  ! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
  ! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
  
-! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
+! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-index-head ] unit-test
  ! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
  ! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
  ! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
@@ -409,18 +409,18 @@ IN: regexp-tests
  ! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
  
  ! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
-! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-head ] unit-test
+! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-index-head ] unit-test
  ! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
  ! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
  ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
  ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
  
-! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
-! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
+! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
+! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test
  ! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
  ! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
-! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
-! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
+! [ 3 ] [ "aacb" "aa?c" <regexp> match-index-head ] unit-test
+! [ 3 ] [ "aacb" "aa??c" <regexp> match-index-head ] unit-test
  
  ! "ab" "a(?=b*)" <regexp> match
  ! "abbbbbc" "a(?=b*c)" <regexp> match
diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor

index 45660ad309a554583105e251f1d8ae341244ef0e..0502cb4d4b765b9176fa422bea6bbdc2d37d5ce3 100644 (file)
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@@ -45,13 +45,13 @@ C: <reverse-matcher> reverse-matcher
  : compile-reverse ( regexp -- regexp )
      dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ;
  
-M: regexp match-index ( string regexp -- index/f )
+M: regexp match-index-from ( string regexp -- index/f )
      dup dfa-quot>>
      [ <quot-matcher> ]
      [ compile-regexp dfa>> <dfa-matcher> ] ?if
-    match-index ;
+    match-index-from ;
  
-M: reverse-matcher match-index ( string regexp -- index/f )
+M: reverse-matcher match-index-from ( string regexp -- index/f )
      [ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi*
      <dfa-traverser> do-match match-index>> ;
  
diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor

index e215cde4161b556232db27eb22e48f5059563eda..b890ca7e122e15f635283a9b58d6aaac50b657a3 100644 (file)
--- a/basis/regexp/traversal/traversal.factor
+++ b/basis/regexp/traversal/traversal.factor
@@ -12,11 +12,11 @@ TUPLE: dfa-traverser
      current-index
      match-index ;
  
-: <dfa-traverser> ( text dfa -- match )
+: <dfa-traverser> ( start-index text dfa -- match )
      dfa-traverser new
          swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
          swap >>text
-        0 >>current-index ;
+        swap >>current-index ;
  
  : final-state? ( dfa-traverser -- ? )
      [ current-state>> ]
@@ -65,5 +65,5 @@ TUPLE: dfa-traverser
  
  TUPLE: dfa-matcher dfa ;
  C: <dfa-matcher> dfa-matcher
-M: dfa-matcher match-index 
+M: dfa-matcher match-index-from
      dfa>> <dfa-traverser> do-match match-index>> ;
author	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Fri, 27 Feb 2009 04:14:41 +0000 (22:14 -0600)
committer	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Fri, 27 Feb 2009 04:14:41 +0000 (22:14 -0600)
basis/regexp/compiler/compiler.factor		patch \| blob \| history
basis/regexp/matchers/matchers.factor		patch \| blob \| history
basis/regexp/regexp-tests.factor		patch \| blob \| history
basis/regexp/regexp.factor		patch \| blob \| history
basis/regexp/traversal/traversal.factor		patch \| blob \| history