! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.classes kernel sequences regexp.negation
-quotations regexp.minimize assocs fry math locals combinators
+quotations assocs fry math locals combinators
accessors words compiler.units kernel.private strings
-sequences.private arrays regexp.matchers call namespaces
+sequences.private arrays call namespaces
regexp.transition-tables combinators.short-circuit ;
IN: regexp.compiler
GENERIC: question>quot ( question -- quot )
-<PRIVATE
-
SYMBOL: shortest?
SYMBOL: backwards?
+<PRIVATE
+
M: t question>quot drop [ 2drop t ] ;
M: beginning-of-input question>quot
[ values ]
bi swap ;
-: dfa>word ( dfa -- word )
+: dfa>main-word ( dfa -- word )
states>words [ states>code ] keep start-state>> ;
-: check-string ( string -- string )
- ! Make this configurable
- dup string? [ "String required" throw ] unless ;
-
-: setup-regexp ( start-index string -- f start-index string )
- [ f ] [ >fixnum ] [ check-string ] tri* ; inline
-
PRIVATE>
-! The quotation returned is ( start-index string -- i/f )
-
-: dfa>quotation ( dfa -- quot )
- dfa>word execution-quot '[ setup-regexp @ ] ;
-
-: dfa>shortest-quotation ( dfa -- quot )
- t shortest? [ dfa>quotation ] with-variable ;
+: simple-define-temp ( quot effect -- word )
+ [ [ define-temp ] with-compilation-unit ] call( quot effect -- word ) ;
-: dfa>reverse-quotation ( dfa -- quot )
- t backwards? [ dfa>quotation ] with-variable ;
+: dfa>word ( dfa -- quot )
+ dfa>main-word execution-quot '[ drop [ f ] 2dip @ ]
+ (( start-index string regexp -- i/f )) simple-define-temp ;
-: dfa>reverse-shortest-quotation ( dfa -- quot )
- t backwards? [ dfa>shortest-quotation ] with-variable ;
+: dfa>shortest-word ( dfa -- word )
+ t shortest? [ dfa>word ] with-variable ;
-TUPLE: quot-matcher quot ;
-C: <quot-matcher> quot-matcher
+: dfa>reverse-word ( dfa -- word )
+ t backwards? [ dfa>word ] with-variable ;
-M: quot-matcher match-index-from
- quot>> call( index string -- i/f ) ;
+: dfa>reverse-shortest-word ( dfa -- word )
+ t backwards? [ dfa>shortest-word ] with-variable ;
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: regexp tools.test kernel sequences regexp.parser regexp.private
-eval strings multiline accessors regexp.matchers ;
+eval strings multiline accessors ;
IN: regexp-tests
\ <regexp> must-infer
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
-[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test
-[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test
+[ t ] [ "abc" R/ abc/r matches? ] unit-test
+[ t ] [ "abc" R/ a[bB][cC]/r matches? ] unit-test
-[ t ] [ 3 "xabc" R/ abc/ <reverse-matcher> match-index-from >boolean ] unit-test
-[ t ] [ 3 "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-from >boolean ] unit-test
+[ t ] [ 3 "xabc" R/ abc/r match-index-from >boolean ] unit-test
+[ t ] [ 3 "xabc" R/ a[bB][cC]/r match-index-from >boolean ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ t ] [ "aaaa" R/ .*a./ matches? ] unit-test
+[ f ] [ "ab" R/ (?~ac|\p{Lower}b)/ matches? ] unit-test
+[ f ] [ "ab" R/ (?~ac|[a-z]b)/ matches? ] unit-test
+[ f ] [ "ac" R/ (?~ac|\p{Lower}b)/ matches? ] unit-test
+[ f ] [ "ac" R/ (?~ac|[a-z]b)/ matches? ] unit-test
+[ f ] [ "ac" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
+[ f ] [ "ab" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
+[ f ] [ "πb" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
+[ t ] [ "πc" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
+[ t ] [ "Ab" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
+
! DFA is compiled when needed, or when literal
-[ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test
-[ t ] [ R/ foo/ dfa>> >boolean ] unit-test
+[ regexp-initial-word ] [ "foo" <regexp> dfa>> ] unit-test
+[ f ] [ R/ foo/ dfa>> \ regexp-initial-word = ] unit-test
[ t ] [ "a" R/ ^a/ matches? ] unit-test
[ f ] [ "\na" R/ ^a/ matches? ] unit-test
! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer
-namespaces parser arrays fry locals regexp.minimize
-regexp.parser regexp.nfa regexp.dfa regexp.classes
-regexp.transition-tables splitting sorting regexp.ast
-regexp.negation regexp.matchers regexp.compiler ;
+namespaces parser arrays fry locals regexp.parser splitting
+sorting regexp.ast regexp.negation regexp.compiler words
+call call.private math.ranges ;
IN: regexp
TUPLE: regexp
{ raw read-only }
{ parse-tree read-only }
{ options read-only }
- dfa reverse-dfa ;
+ dfa next-match ;
-: make-regexp ( string ast -- regexp )
- f f <options> f f regexp boa ; foldable
- ! Foldable because, when the dfa slot is set,
- ! it'll be set to the same thing regardless of who sets it
+TUPLE: reverse-regexp < regexp ;
-: <optioned-regexp> ( string options -- regexp )
- [ dup parse-regexp ] [ string>options ] bi*
- f f regexp boa ;
+<PRIVATE
-: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
+: maybe-negated ( lookaround quot -- regexp-quot )
+ '[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
+
+M: lookahead question>quot ! Returns ( index string -- ? )
+ [ ast>dfa dfa>shortest-word '[ f _ execute ] ] maybe-negated ;
-TUPLE: reverse-matcher regexp ;
-C: <reverse-matcher> reverse-matcher
-! Reverse matchers won't work properly with most combinators, for now
+: <reversed-option> ( ast -- reversed )
+ "r" string>options <with-options> ;
+
+M: lookbehind question>quot ! Returns ( index string -- ? )
+ [
+ <reversed-option>
+ ast>dfa dfa>reverse-shortest-word
+ '[ [ 1- ] dip f _ execute ]
+ ] maybe-negated ;
+
+<PRIVATE
+
+: check-string ( string -- string )
+ ! Make this configurable
+ dup string? [ "String required" throw ] unless ;
+
+: match-index-from ( i string regexp -- index/f )
+ ! This word is unsafe. It assumes that i is a fixnum
+ ! and that string is a string.
+ dup dfa>> execute( index string regexp -- i/f ) ;
+
+: match-index-head ( string regexp -- index/f )
+ [ 0 ] 2dip [ check-string ] dip match-index-from ;
+
+PRIVATE>
+
+: matches? ( string regexp -- ? )
+ dupd match-index-head
+ [ swap length = ] [ drop f ] if* ;
+
+<PRIVATE
+
+: match-slice ( i string quot -- slice/f )
+ [ 2dup ] dip call
+ [ swap <slice> ] [ 2drop f ] if* ; inline
+
+: match-from ( i string quot -- slice/f )
+ [ [ length [a,b) ] keep ] dip
+ '[ _ _ match-slice ] map-find drop ; inline
+
+: next-match ( i string quot -- i match/f )
+ match-from [ dup [ to>> ] when ] keep ; inline
+
+: do-next-match ( i string regexp -- i match/f )
+ dup next-match>> execute( i string regexp -- i match/f ) ;
+
+PRIVATE>
+
+: all-matches ( string regexp -- seq )
+ [ check-string ] dip
+ [ 0 [ dup ] ] 2dip '[ _ _ do-next-match ] produce
+ nip but-last ;
+
+: count-matches ( string regexp -- n )
+ all-matches length ;
+
+<PRIVATE
+
+:: split-slices ( string slices -- new-slices )
+ slices [ to>> ] map 0 prefix
+ slices [ from>> ] map string length suffix
+ [ string <slice> ] 2map ;
+
+: match-head ( str regexp -- slice/f )
+ [
+ [ 0 ] [ check-string ] [ dup dfa>> '[ _ _ execute ] ] tri*
+ match-from
+ ] call( str regexp -- slice/f ) ;
+
+PRIVATE>
+
+: re-split1 ( string regexp -- before after/f )
+ dupd match-head [ 1array split-slices first2 ] [ f ] if* ;
+
+: re-split ( string regexp -- seq )
+ dupd all-matches split-slices ;
+
+: re-replace ( string regexp replacement -- result )
+ [ re-split ] dip join ;
<PRIVATE
: get-ast ( regexp -- ast )
[ parse-tree>> ] [ options>> ] bi <with-options> ;
-: compile-regexp ( regexp -- regexp )
- dup '[ [ _ get-ast ast>dfa dfa>quotation ] unless* ] change-dfa ;
+GENERIC: compile-regexp ( regex -- regexp )
-: <reversed-option> ( ast -- reversed )
- "r" string>options <with-options> ;
+: regexp-initial-word ( i string regexp -- i/f )
+ compile-regexp match-index-from ;
-: maybe-negated ( lookaround quot -- regexp-quot )
- '[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
+: do-compile-regexp ( regexp -- regexp )
+ dup '[
+ dup \ regexp-initial-word =
+ [ drop _ get-ast ast>dfa dfa>word ] when
+ ] change-dfa ;
-M: lookahead question>quot ! Returns ( index string -- ? )
- [ ast>dfa dfa>shortest-quotation ] maybe-negated ;
+M: regexp compile-regexp ( regexp -- regexp )
+ do-compile-regexp ;
-M: lookbehind question>quot ! Returns ( index string -- ? )
- [
- <reversed-option>
- ast>dfa dfa>reverse-shortest-quotation
- [ [ 1- ] dip ] prepose
- ] maybe-negated ;
+M: reverse-regexp compile-regexp ( regexp -- regexp )
+ t backwards? [ do-compile-regexp ] with-variable ;
+
+GENERIC: compile-next-match ( regexp -- regexp )
-: compile-reverse ( regexp -- regexp )
+: next-initial-word ( i string regexp -- i slice/f )
+ compile-next-match do-next-match ;
+
+M: regexp compile-next-match ( regexp -- regexp )
dup '[
- [
- _ get-ast <reversed-option>
- ast>dfa dfa>reverse-quotation
- ] unless*
- ] change-reverse-dfa ;
+ dup \ next-initial-word = [
+ drop _ compile-regexp dfa>>
+ '[ _ '[ _ _ execute ] next-match ]
+ (( i string -- i match/f )) simple-define-temp
+ ] when
+ ] change-next-match ;
-M: regexp match-index-from
- compile-regexp dfa>> <quot-matcher> match-index-from ;
+! Write M: reverse-regexp compile-next-match
-M: reverse-matcher match-index-from
- regexp>> compile-reverse reverse-dfa>>
- <quot-matcher> match-index-from ;
+PRIVATE>
+
+: new-regexp ( string ast options class -- regexp )
+ [ \ regexp-initial-word \ next-initial-word ] dip boa ; inline
+
+: make-regexp ( string ast -- regexp )
+ f f <options> regexp new-regexp ;
+
+: <optioned-regexp> ( string options -- regexp )
+ [ dup parse-regexp ] [ string>options ] bi*
+ dup on>> reversed-regexp swap member?
+ [ reverse-regexp new-regexp ]
+ [ regexp new-regexp ] if ;
+
+: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
+
+<PRIVATE
! The following two should do some caching
: parsing-regexp ( accum end -- accum )
lexer get [ take-until ] [ parse-noblank-token ] bi
- <optioned-regexp> compile-regexp parsed ;
+ <optioned-regexp> compile-next-match parsed ;
PRIVATE>
[ options>> options>string % ] bi
] "" make
] keep present-text ;
+