! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: regexp.classes kernel sequences regexp.negation
-quotations regexp.minimize assocs fry math locals combinators
-accessors words compiler.units kernel.private strings
-sequences.private arrays regexp.matchers call namespaces
-regexp.transition-tables combinators.short-circuit ;
+USING: accessors assocs combinators combinators.short-circuit
+kernel kernel.private math namespaces quotations regexp.classes
+regexp.transition-tables sequences sequences.private sets
+strings unicode words ;
IN: regexp.compiler
GENERIC: question>quot ( question -- quot )
-<PRIVATE
-
SYMBOL: shortest?
SYMBOL: backwards?
+<PRIVATE
+
M: t question>quot drop [ 2drop t ] ;
+M: f question>quot drop [ 2drop f ] ;
M: beginning-of-input question>quot
drop [ drop zero? ] ;
} 2&&
] ;
-M: $ question>quot
+M: $crlf question>quot
drop [ { [ length = ] [ ?nth "\r\n" member? ] } 2|| ] ;
-M: ^ question>quot
- drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
+M: ^crlf question>quot
+ drop [ { [ drop zero? ] [ [ 1 - ] dip ?nth "\r\n" member? ] } 2|| ] ;
+
+M: $unix question>quot
+ drop [ { [ length = ] [ ?nth CHAR: \n = ] } 2|| ] ;
+
+M: ^unix question>quot
+ drop [ { [ drop zero? ] [ [ 1 - ] dip ?nth CHAR: \n = ] } 2|| ] ;
+
+M: word-break question>quot
+ drop [ word-break-at? ] ;
: (execution-quot) ( next-state -- quot )
! The conditions here are for lookaround and anchors, etc
[ question>> question>quot ] [ yes>> ] [ no>> ] tri
[ (execution-quot) ] bi@
'[ 2dup @ _ _ if ]
- ] [ '[ _ execute ] ] if ;
+ ] [ 1quotation ] if ;
: execution-quot ( next-state -- quot )
dup sequence? [ first ] when
: literals>cases ( literal-transitions -- case-body )
[ execution-quot ] assoc-map ;
-: expand-one-or ( or-class transition -- alist )
- [ seq>> ] dip '[ _ 2array ] map ;
-
-: expand-or ( alist -- new-alist )
- [
- first2 over or-class?
- [ expand-one-or ] [ 2array 1array ] if
- ] map concat ;
-
: split-literals ( transitions -- case default )
- >alist expand-or [ first integer? ] partition
+ { } assoc-like [ first integer? ] partition
[ [ literals>cases ] keep ] dip non-literals>dispatch ;
-:: step ( last-match index str quot final? direction -- last-index/f )
+: advance ( index backwards? -- index+/-1 )
+ -1 1 ? + >fixnum ; inline
+
+: check ( index string backwards? -- in-bounds? )
+ [ drop -1 eq? not ] [ length < ] if ; inline
+
+:: step ( last-match index str quot final? backwards? -- last-index/f )
final? index last-match ?
- index str bounds-check? [
- index direction + str
+ index str backwards? check [
+ index backwards? advance str
index str nth-unsafe
quot call
] when ; inline
-: direction ( -- n )
- backwards? get -1 1 ? ;
-
: transitions>quot ( transitions final-state? -- quot )
dup shortest? get and [ 2drop [ drop nip ] ] [
- [ split-literals swap case>quot ] dip direction
- '[ { array-capacity string } declare _ _ _ step ]
+ [ split-literals swap case>quot ] dip backwards? get
+ '[ { fixnum string } declare _ _ _ step ]
] if ;
: word>quot ( word dfa -- quot )
[ transitions>> at ]
- [ final-states>> key? ] 2bi
+ [ final-states>> in? ] 2bi
transitions>quot ;
: states>code ( words dfa -- )
- [ ! with-compilation-unit doesn't compile, so we need call( -- )
- [
- '[
- dup _ word>quot
- (( last-match index string -- ? ))
- define-declared
- ] each
- ] with-compilation-unit
- ] call( words dfa -- ) ;
+ '[
+ dup _ word>quot
+ ( last-match index string -- ? )
+ define-declared
+ ] each ;
: states>words ( dfa -- words dfa )
dup transitions>> keys [ gensym ] H{ } map>assoc
[ transitions-at ]
[ values ]
- bi swap ;
+ bi swap ;
-: dfa>word ( dfa -- word )
+: dfa>main-word ( dfa -- word )
states>words [ states>code ] keep start-state>> ;
-: check-string ( string -- string )
- ! Make this configurable
- dup string? [ "String required" throw ] unless ;
-
-: setup-regexp ( start-index string -- f start-index string )
- [ f ] [ >fixnum ] [ check-string ] tri* ; inline
+: word-template ( quot -- quot' )
+ '[ drop [ f ] 2dip over array-capacity? _ [ 2drop ] if ] ;
PRIVATE>
-! The quotation returned is ( start-index string -- i/f )
-
-: dfa>quotation ( dfa -- quot )
- dfa>word execution-quot '[ setup-regexp @ ] ;
-
-: dfa>shortest-quotation ( dfa -- quot )
- t shortest? [ dfa>quotation ] with-variable ;
-
-: dfa>reverse-quotation ( dfa -- quot )
- t backwards? [ dfa>quotation ] with-variable ;
+: dfa>word ( dfa -- quot )
+ dfa>main-word execution-quot word-template
+ ( start-index string regexp -- i/f ) define-temp ;
-: dfa>reverse-shortest-quotation ( dfa -- quot )
- t backwards? [ dfa>shortest-quotation ] with-variable ;
+: dfa>shortest-word ( dfa -- word )
+ t shortest? [ dfa>word ] with-variable ;
-TUPLE: quot-matcher quot ;
-C: <quot-matcher> quot-matcher
+: dfa>reverse-word ( dfa -- word )
+ t backwards? [ dfa>word ] with-variable ;
-M: quot-matcher match-index-from
- quot>> call( index string -- i/f ) ;
+: dfa>reverse-shortest-word ( dfa -- word )
+ t backwards? [ dfa>shortest-word ] with-variable ;