: transitions>quot ( transitions final-state? -- quot )
[ split-literals suffix ] dip
- '[ _ _ step ] ;
- ! '[ { array-capacity string } declare _ _ step ] ;
+ '[ { array-capacity string } declare _ _ step ] ;
: word>quot ( word dfa -- quot )
[ transitions>> at ]
: dfa>word ( dfa -- word )
states>words [ states>code ] keep start-state>> ;
-: run-regexp ( string word -- ? )
- [ f 0 ] 2dip execute ; inline
+: check-string ( string -- string )
+ dup string? [ "String required" throw ] unless ;
+
+: run-regexp ( start-index string word -- ? )
+ { [ f ] [ >fixnum ] [ check-string ] [ execute ] } spread ; inline
: dfa>quotation ( dfa -- quot )
dfa>word '[ _ run-regexp ] ;
TUPLE: quot-matcher quot ;
C: <quot-matcher> quot-matcher
-M: quot-matcher match-index
- quot>> call( string -- i/f ) ;
+M: quot-matcher match-index-from
+ quot>> call( index string -- i/f ) ;
! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
-USING: kernel sequences math splitting make fry ;
+USING: kernel sequences math splitting make fry locals math.ranges
+accessors arrays ;
IN: regexp.matchers
! For now, a matcher is just something with a method to do the
! equivalent of match.
-! matcher protocol:
-GENERIC: match-index ( string matcher -- index/f )
+GENERIC: match-index-from ( i string matcher -- index/f )
-: match ( string matcher -- slice/f )
- dupd match-index [ head-slice ] [ drop f ] if* ;
+: match-index-head ( string matcher -- index/f )
+ [ 0 ] 2dip match-index-from ;
+
+: match-slice ( i string matcher -- slice/f )
+ [ 2dup ] dip match-index-from
+ [ swap <slice> ] [ 2drop f ] if* ;
: matches? ( string matcher -- ? )
- dupd match-index
+ dupd match-index-head
[ swap length = ] [ drop f ] if* ;
-: match-head ( string matcher -- end/f ) match [ length ] [ f ] if* ;
+: map-find ( seq quot -- result elt )
+ [ f ] 2dip
+ '[ nip @ dup ] find
+ [ [ drop f ] unless ] dip ; inline
-: match-at ( string m matcher -- n/f finished? )
- [
- 2dup swap length > [ 2drop f f ] [ tail-slice t ] if
- ] dip swap [ match-head f ] [ 2drop f t ] if ;
+:: match-from ( i string matcher -- slice/f )
+ i string length [a,b)
+ [ string matcher match-slice ] map-find drop ;
-: match-range ( string m matcher -- a/f b/f )
- 3dup match-at over [
- drop nip rot drop dupd +
- ] [
- [ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if
- ] if ;
+: match-head ( str matcher -- slice/f )
+ [ 0 ] 2dip match-from ;
-: first-match ( string matcher -- slice/f )
- dupd 0 swap match-range rot over [ <slice> ] [ 3drop f ] if ;
+: next-match ( i string matcher -- i match/f )
+ match-from [ dup [ to>> ] when ] keep ;
-: re-cut ( string matcher -- end/f start )
- dupd first-match
- [ split1-slice swap ] [ "" like f swap ] if* ;
+:: all-matches ( string matcher -- seq )
+ 0 [ dup ] [ string matcher next-match ] [ ] produce nip but-last ;
+
+: count-matches ( string matcher -- n )
+ all-matches length ;
<PRIVATE
-: (re-split) ( string matcher -- )
- over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
+:: split-slices ( string slices -- new-slices )
+ slices [ to>> ] map 0 prefix
+ slices [ from>> ] map string length suffix
+ [ string <slice> ] 2map ;
PRIVATE>
+: re-split1 ( string matcher -- before after/f )
+ dupd match-head [ 1array split-slices first2 ] [ f ] if* ;
+
: re-split ( string matcher -- seq )
- [ (re-split) ] { } make ;
+ dupd all-matches split-slices ;
: re-replace ( string matcher replacement -- result )
[ re-split ] dip join ;
-
-: next-match ( string matcher -- end/f match/f )
- dupd first-match dup
- [ [ split1-slice nip ] keep ] [ 2drop f f ] if ;
-
-: all-matches ( string matcher -- seq )
- [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
-
-: count-matches ( string matcher -- n )
- all-matches length ;
[ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test
[ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test
-[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
-[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
+[ 3 ] [ "aaacb" "a*" <regexp> match-index-head ] unit-test
+[ 2 ] [ "aaacb" "aa?" <regexp> match-index-head ] unit-test
[ t ] [ "aaa" R/ AAA/i matches? ] unit-test
[ f ] [ "aax" R/ AAA/i matches? ] unit-test
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
-[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
-[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
+[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test
+[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test
-[ t ] [ "xabc" R/ abc/ <reverse-matcher> match >boolean ] unit-test
-[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match >boolean ] unit-test
+[ t ] [ "xabc" R/ abc/ <reverse-matcher> match-index-head >boolean ] unit-test
+[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-head >boolean ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test
-[ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test
-[ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test
+[ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
+[ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
-[ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
-[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
+[ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
+[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
-[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> first-match >string ] unit-test
+[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> match-head >string ] unit-test
! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
[ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
/*
-[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
-[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
+[ f ] [ "ab" "a(?!b)" <regexp> match-head ] unit-test
+[ "a" ] [ "ac" "a(?!b)" <regexp> match-head >string ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
-[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
-[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
-[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test
+[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> match-head >string ] unit-test
+[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> match-head >string ] unit-test
+[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> match-head >string ] unit-test
-[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
-[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
+[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-index-head ] unit-test
+[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-index-head ] unit-test
*/
! Bug in parsing word
! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
-! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
-! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
+! [ f ] [ "foobxr" "foo\\z" <regexp> match-index-head ] unit-test
+! [ 3 ] [ "foo" "foo\\z" <regexp> match-index-head ] unit-test
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
-! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
+! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-index-head ] unit-test
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
-! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-head ] unit-test
+! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-index-head ] unit-test
! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
-! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
-! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
+! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
+! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
-! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
-! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
+! [ 3 ] [ "aacb" "aa?c" <regexp> match-index-head ] unit-test
+! [ 3 ] [ "aacb" "aa??c" <regexp> match-index-head ] unit-test
! "ab" "a(?=b*)" <regexp> match
! "abbbbbc" "a(?=b*c)" <regexp> match
: compile-reverse ( regexp -- regexp )
dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ;
-M: regexp match-index ( string regexp -- index/f )
+M: regexp match-index-from ( string regexp -- index/f )
dup dfa-quot>>
[ <quot-matcher> ]
[ compile-regexp dfa>> <dfa-matcher> ] ?if
- match-index ;
+ match-index-from ;
-M: reverse-matcher match-index ( string regexp -- index/f )
+M: reverse-matcher match-index-from ( string regexp -- index/f )
[ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi*
<dfa-traverser> do-match match-index>> ;
current-index
match-index ;
-: <dfa-traverser> ( text dfa -- match )
+: <dfa-traverser> ( start-index text dfa -- match )
dfa-traverser new
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
swap >>text
- 0 >>current-index ;
+ swap >>current-index ;
: final-state? ( dfa-traverser -- ? )
[ current-state>> ]
TUPLE: dfa-matcher dfa ;
C: <dfa-matcher> dfa-matcher
-M: dfa-matcher match-index
+M: dfa-matcher match-index-from
dfa>> <dfa-traverser> do-match match-index>> ;