! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: regexp regexp.private regexp.classes kernel sequences regexp.negation
-quotations regexp.minimize assocs fry math locals combinators
-accessors words compiler.units ;
+USING: regexp.classes kernel sequences regexp.negation
+quotations assocs fry math locals combinators
+accessors words compiler.units kernel.private strings
+sequences.private arrays namespaces unicode.breaks
+regexp.transition-tables combinators.short-circuit ;
IN: regexp.compiler
-: literals>cases ( literal-transitions -- case-body )
- [ 1quotation ] assoc-map ;
+GENERIC: question>quot ( question -- quot )
+
+SYMBOL: shortest?
+SYMBOL: backwards?
+
+<PRIVATE
+
+M: t question>quot drop [ 2drop t ] ;
+M: f question>quot drop [ 2drop f ] ;
+
+M: beginning-of-input question>quot
+ drop [ drop zero? ] ;
+
+M: end-of-input question>quot
+ drop [ length = ] ;
+
+M: end-of-file question>quot
+ drop [
+ {
+ [ length swap - 2 <= ]
+ [ swap tail { "\n" "\r\n" "\r" "" } member? ]
+ } 2&&
+ ] ;
+
+M: $ question>quot
+ drop [ { [ length = ] [ ?nth "\r\n" member? ] } 2|| ] ;
+
+M: ^ question>quot
+ drop [ { [ drop zero? ] [ [ 1 - ] dip ?nth "\r\n" member? ] } 2|| ] ;
+
+M: $unix question>quot
+ drop [ { [ length = ] [ ?nth CHAR: \n = ] } 2|| ] ;
+
+M: ^unix question>quot
+ drop [ { [ drop zero? ] [ [ 1 - ] dip ?nth CHAR: \n = ] } 2|| ] ;
+
+M: word-break question>quot
+ drop [ word-break-at? ] ;
-: non-literals>dispatch ( non-literal-transitions -- quot )
- [ [ '[ dup _ class-member? ] ] [ 1quotation ] bi* ] assoc-map
- [ 3drop f ] suffix '[ _ cond ] ;
+: (execution-quot) ( next-state -- quot )
+ ! The conditions here are for lookaround and anchors, etc
+ dup condition? [
+ [ question>> question>quot ] [ yes>> ] [ no>> ] tri
+ [ (execution-quot) ] bi@
+ '[ 2dup @ _ _ if ]
+ ] [ '[ _ execute ] ] if ;
+
+: execution-quot ( next-state -- quot )
+ dup sequence? [ first ] when
+ (execution-quot) ;
+
+TUPLE: box contents ;
+C: <box> box
+
+: condition>quot ( condition -- quot )
+ ! Conditions here are for different classes
+ dup condition? [
+ [ question>> ] [ yes>> ] [ no>> ] tri
+ [ condition>quot ] bi@
+ '[ dup _ class-member? _ _ if ]
+ ] [
+ contents>>
+ [ [ 3drop ] ] [ execution-quot '[ drop @ ] ] if-empty
+ ] if ;
+
+: non-literals>dispatch ( literals non-literals -- quot )
+ [ swap ] assoc-map ! we want state => predicate, and get the opposite as input
+ swap keys f assoc-answers
+ table>condition [ <box> ] condition-map condition>quot ;
+
+: literals>cases ( literal-transitions -- case-body )
+ [ execution-quot ] assoc-map ;
: split-literals ( transitions -- case default )
- ! Convert disjunction of literals to literals. Also maybe small ranges.
- >alist [ first integer? ] partition
- [ literals>cases ] [ non-literals>dispatch ] bi* ;
+ { } assoc-like [ first integer? ] partition
+ [ [ literals>cases ] keep ] dip non-literals>dispatch ;
-USING: kernel.private strings sequences.private ;
+: advance ( index backwards? -- index+/-1 )
+ -1 1 ? + >fixnum ; inline
-:: step ( index str case-body final? -- match? )
- index str bounds-check? [
- index 1+ str
+: check ( index string backwards? -- in-bounds? )
+ [ drop -1 eq? not ] [ length < ] if ; inline
+
+:: step ( last-match index str quot final? backwards? -- last-index/f )
+ final? index last-match ?
+ index str backwards? check [
+ index backwards? advance str
index str nth-unsafe
- case-body case
- ] [ final? ] if ; inline
+ quot call
+ ] when ; inline
: transitions>quot ( transitions final-state? -- quot )
- [ split-literals suffix ] dip
- '[ { array-capacity string } declare _ _ step ] ;
+ dup shortest? get and [ 2drop [ drop nip ] ] [
+ [ split-literals swap case>quot ] dip backwards? get
+ '[ { fixnum string } declare _ _ _ step ]
+ ] if ;
: word>quot ( word dfa -- quot )
[ transitions>> at ]
: states>code ( words dfa -- )
'[
- [
- dup _ word>quot
- (( index string -- ? )) define-declared
- ] each
- ] with-compilation-unit ;
-
-: transitions-at ( transitions assoc -- new-transitions )
- dup '[
- [ _ at ]
- [ [ _ at ] assoc-map ] bi*
- ] assoc-map ;
+ dup _ word>quot
+ (( last-match index string -- ? ))
+ define-declared
+ ] each ;
: states>words ( dfa -- words dfa )
dup transitions>> keys [ gensym ] H{ } map>assoc
- [ [ transitions-at ] rewrite-transitions ]
+ [ transitions-at ]
[ values ]
bi swap ;
-: dfa>word ( dfa -- word )
+: dfa>main-word ( dfa -- word )
states>words [ states>code ] keep start-state>> ;
-: run-regexp ( string word -- ? )
- [ 0 ] 2dip execute ; inline
+: word-template ( quot -- quot' )
+ '[ drop [ f ] 2dip over array-capacity? _ [ 2drop ] if ] ;
+
+PRIVATE>
+
+: dfa>word ( dfa -- quot )
+ dfa>main-word execution-quot word-template
+ (( start-index string regexp -- i/f )) define-temp ;
+
+: dfa>shortest-word ( dfa -- word )
+ t shortest? [ dfa>word ] with-variable ;
+
+: dfa>reverse-word ( dfa -- word )
+ t backwards? [ dfa>word ] with-variable ;
-: regexp>quotation ( regexp -- quot )
- compile-regexp dfa>> dfa>word '[ _ run-regexp ] ;
+: dfa>reverse-shortest-word ( dfa -- word )
+ t backwards? [ dfa>shortest-word ] with-variable ;