--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp regexp.private regexp.classes kernel sequences regexp.negation
+quotations regexp.minimize assocs fry math locals combinators
+accessors words compiler.units ;
+IN: regexp.compiler
+
+: literals>cases ( literal-transitions -- case-body )
+ [ 1quotation ] assoc-map ;
+
+: non-literals>dispatch ( non-literal-transitions -- quot )
+ [ [ '[ dup _ class-member? ] ] [ 1quotation ] bi* ] assoc-map
+ [ 3drop f ] suffix '[ _ cond ] ;
+
+: split-literals ( transitions -- case default )
+ ! Convert disjunction of literals to literals. Also maybe small ranges.
+ >alist [ first integer? ] partition
+ [ literals>cases ] [ non-literals>dispatch ] bi* ;
+
+USING: kernel.private strings sequences.private ;
+
+:: step ( index str case-body final? -- match? )
+ index str bounds-check? [
+ index 1+ str
+ index str nth-unsafe
+ case-body case
+ ] [ final? ] if ; inline
+
+: transitions>quot ( transitions final-state? -- quot )
+ [ split-literals suffix ] dip
+ '[ { array-capacity string } declare _ _ step ] ;
+
+: word>quot ( word dfa -- quot )
+ [ transitions>> at ]
+ [ final-states>> key? ] 2bi
+ transitions>quot ;
+
+: states>code ( words dfa -- )
+ '[
+ [
+ dup _ word>quot
+ (( index string -- ? )) define-declared
+ ] each
+ ] with-compilation-unit ;
+
+: transitions-at ( transitions assoc -- new-transitions )
+ dup '[
+ [ _ at ]
+ [ [ _ at ] assoc-map ] bi*
+ ] assoc-map ;
+
+: states>words ( dfa -- words dfa )
+ dup transitions>> keys [ gensym ] H{ } map>assoc
+ [ [ transitions-at ] rewrite-transitions ]
+ [ values ]
+ bi swap ;
+
+: dfa>word ( dfa -- word )
+ states>words [ states>code ] keep start-state>> ;
+
+: run-regexp ( string word -- ? )
+ [ 0 ] 2dip execute ; inline
+
+: regexp>quotation ( regexp -- quot )
+ compile-regexp dfa>> dfa>word '[ _ run-regexp ] ;
[| trans |
state trans nfa find-closure :> new-state
new-state visited-states new-states add-todo-state
- state new-state trans transition make-transition dfa add-transition
+ state new-state trans dfa add-transition
] each
nfa dfa new-states visited-states new-transitions
] if-empty ;
: unify-final-state ( transition-table -- transition-table )
dup [ final-states>> keys ] keep
- '[ -2 epsilon <literal-transition> _ add-transition ] each
+ '[ -2 epsilon _ add-transition ] each
H{ { -2 -2 } } >>final-states ;
: adjoin-dfa ( transition-table -- start end )
GENERIC: nfa-node ( node -- start-state end-state )
-: add-simple-entry ( obj class -- start-state end-state )
- [ next-state next-state 2dup ] 2dip
- make-transition nfa-table get add-transition ;
+: add-simple-entry ( obj -- start-state end-state )
+ [ next-state next-state 2dup ] dip
+ nfa-table get add-transition ;
: epsilon-transition ( source target -- )
- epsilon <literal-transition> nfa-table get add-transition ;
+ epsilon nfa-table get add-transition ;
M:: star nfa-node ( node -- start end )
node term>> nfa-node :> s1 :> s0
s2 s3 ;
M: tagged-epsilon nfa-node
- literal-transition add-simple-entry ;
+ add-simple-entry ;
M: concatenation nfa-node ( node -- start end )
[ first>> ] [ second>> ] bi
] when ;
M: integer nfa-node ( node -- start end )
- modify-class dup class?
- class-transition literal-transition ?
- add-simple-entry ;
+ modify-class add-simple-entry ;
M: primitive-class modify-class
class>> modify-class <primitive-class> ;
] when ;
M: class nfa-node
- modify-class class-transition add-simple-entry ;
+ modify-class add-simple-entry ;
M: with-options nfa-node ( node -- start end )
dup options>> [ tree>> nfa-node ] using-options ;
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
-! [ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/ matches? ] unit-test ! FIXME
+
+[ t ] [ "xabc" R/ abc/ match-from-end >boolean ] unit-test
+[ t ] [ "xabc" R/ a[bB][cC]/ match-from-end >boolean ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
{ raw read-only }
{ parse-tree read-only }
{ options read-only }
- dfa ;
+ dfa reverse-dfa ;
: make-regexp ( string ast -- regexp )
- f f <options> f regexp boa ; foldable
+ f f <options> f f regexp boa ; foldable
! Foldable because, when the dfa slot is set,
! it'll be set to the same thing regardless of who sets it
: <optioned-regexp> ( string options -- regexp )
[ dup parse-regexp ] [ string>options ] bi*
- f regexp boa ;
+ f f regexp boa ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
<PRIVATE
+: get-ast ( regexp -- ast )
+ [ parse-tree>> ] [ options>> ] bi <with-options> ;
+
: compile-regexp ( regexp -- regexp )
- dup dfa>> [
- dup
- [ parse-tree>> ]
- [ options>> ] bi
- <with-options> ast>dfa
- >>dfa
- ] unless ;
+ dup '[ [ _ get-ast ast>dfa ] unless* ] change-dfa ;
+
+: <reversed-option> ( ast -- reversed )
+ "r" string>options <with-options> ;
+
+: compile-reverse ( regexp -- regexp )
+ dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ;
: (match) ( string regexp -- dfa-traverser )
- compile-regexp dfa>> <dfa-traverser> do-match ; inline
+ compile-regexp dfa>> <dfa-traverser> do-match ;
+
+: (match-reversed) ( string regexp -- dfa-traverser )
+ [ <reversed> ] [ compile-reverse reverse-dfa>> ] bi*
+ <dfa-traverser> do-match ;
PRIVATE>
: match ( string regexp -- slice/f )
(match) return-match ;
+: match-from-end ( string regexp -- slice/f )
+ (match-reversed) return-match ;
+
: matches? ( string regexp -- ? )
dupd match
[ [ length ] bi@ = ] [ drop f ] if* ;
{ "R| " "|" }
} swap [ subseq? not nip ] curry assoc-find drop ;
+: take-until ( end lexer -- string )
+ dup skip-blank [
+ [ index-from ] 2keep
+ [ swapd subseq ]
+ [ 2drop 1+ ] 3bi
+ ] change-lexer-column ;
+
+: parse-noblank-token ( lexer -- str/f )
+ dup still-parsing-line? [ (parse-token) ] [ drop f ] if ;
+
: parsing-regexp ( accum end -- accum )
- lexer get dup skip-blank
- [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
- lexer get dup still-parsing-line?
- [ (parse-token) ] [ drop f ] if
+ lexer get [ take-until ] [ parse-noblank-token ] bi
<optioned-regexp> compile-regexp parsed ;
PRIVATE>
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs fry hashtables kernel sequences
-vectors ;
+vectors locals ;
IN: regexp.transition-tables
-TUPLE: transition from to obj ;
-TUPLE: literal-transition < transition ;
-TUPLE: class-transition < transition ;
-TUPLE: default-transition < transition ;
-
-TUPLE: literal obj ;
-TUPLE: class obj ;
-TUPLE: default ;
-: make-transition ( from to obj class -- obj )
- new
- swap >>obj
- swap >>to
- swap >>from ;
-
-: <literal-transition> ( from to obj -- transition )
- literal-transition make-transition ;
-
-: <class-transition> ( from to obj -- transition )
- class-transition make-transition ;
-
-: <default-transition> ( from to -- transition )
- t default-transition make-transition ;
-
TUPLE: transition-table transitions start-state final-states ;
: <transition-table> ( -- transition-table )
: maybe-initialize-key ( key hashtable -- )
2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
-: set-transition ( transition hash -- )
- #! set the state as a key
- 2dup [ to>> ] dip maybe-initialize-key
- [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
- 2dup at* [ 2nip push-at ]
- [ drop [ H{ } clone [ push-at ] keep ] 2dip set-at ] if ;
+:: set-transition ( from to obj hash -- )
+ to hash maybe-initialize-key
+ from hash at
+ [ [ to obj ] dip push-at ]
+ [ to 1vector obj associate from hash set-at ] if* ;
-: add-transition ( transition transition-table -- )
+: add-transition ( from to obj transition-table -- )
transitions>> set-transition ;
dfa-table
current-state
text
- match-failed?
start-index current-index
matches ;
[ current-state>> ]
[ dfa-table>> final-states>> ] bi key? ;
-: beginning-of-text? ( dfa-traverser -- ? )
- current-index>> 0 <= ; inline
-
: end-of-text? ( dfa-traverser -- ? )
[ current-index>> ] [ text>> length ] bi >= ; inline
{
[ current-state>> not ]
[ end-of-text? ]
- [ match-failed?>> ]
} 1|| ;
: save-final-state ( dfa-straverser -- )
1 text-character ;
: increment-state ( dfa-traverser state -- dfa-traverser )
- [ [ 1 + ] change-current-index ] dip >>current-state ;
+ >>current-state
+ [ 1 + ] change-current-index ;
: match-literal ( transition from-state table -- to-state/f )
transitions>> at at ;
swap '[ drop _ swap class-member? ] assoc-find spin ?
] [ drop ] if ;
-: match-default ( transition from-state table -- to-state/f )
- [ drop ] 2dip transitions>> at t swap at ;
-
: match-transition ( obj from-state dfa -- to-state/f )
- { [ match-literal ] [ match-class ] [ match-default ] } 3|| ;
+ { [ match-literal ] [ match-class ] } 3|| ;
: setup-match ( match -- obj state dfa-table )
[ [ current-index>> ] [ text>> ] bi nth ]
dup matches>>
[ drop f ]
[
- [ [ text>> ] [ start-index>> ] bi ]
- [ peek ] bi* rot <slice>
+ [ [ start-index>> ] [ text>> ] bi ]
+ [ peek ] bi* swap <slice>
] if-empty ;