1 ! Copyright (C) 2008 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators kernel math
4 sequences namespaces locals combinators.lib state-tables
5 math.parser state-parser sets dlists unicode.categories
6 math.order quotations shuffle math.ranges splitting
7 symbols fry parser math.ranges inspector strings ;
10 SYMBOLS: eps start-state final-state beginning-of-text
11 end-of-text left-parenthesis alternation left-bracket
12 caret dash ampersand colon ;
14 SYMBOL: runtime-epsilon
16 TUPLE: regexp raw parentheses-count bracket-count
17 state stack nfa new-states dfa minimized-dfa
18 dot-matches-newlines? capture-group captured-groups ;
20 TUPLE: capture-group n range ;
22 ERROR: parentheses-underflow ;
23 ERROR: unbalanced-parentheses ;
24 ERROR: unbalanced-brackets ;
26 : push-stack ( regexp token -- ) swap stack>> push ;
27 : push-all-stack ( regexp seq -- ) swap stack>> push-all ;
28 : next-state ( regexp -- n ) [ 1+ ] change-state state>> ;
30 : check-parentheses-underflow ( regexp -- )
31 parentheses-count>> 0 < [ parentheses-underflow ] when ;
33 : check-unbalanced-parentheses ( regexp -- )
34 parentheses-count>> 0 > [ unbalanced-parentheses ] when ;
36 :: (apply-alternation) ( stack regexp -- )
37 [let | s2 [ stack peek first ]
38 s3 [ stack pop second ]
39 s0 [ stack peek alternation = [ stack pop* ] when stack peek first ]
40 s1 [ stack pop second ]
41 s4 [ regexp next-state ]
42 s5 [ regexp next-state ]
43 table [ regexp nfa>> ] |
45 s4 eps s0 <entry> table add-entry
46 s4 eps s2 <entry> table add-entry
47 s1 eps s5 <entry> table add-entry
48 s3 eps s5 <entry> table add-entry
49 s1 table final-states>> delete-at
50 s3 table final-states>> delete-at
51 t s5 table final-states>> set-at
52 s4 s5 2array stack push ] ;
54 : apply-alternation ( regexp -- )
55 [ stack>> ] [ (apply-alternation) ] bi ;
57 : apply-alternation? ( stack -- ? )
59 [ 2drop f ] [ 2 - swap nth alternation = ] if ;
61 :: (apply-concatenation) ( stack regexp -- )
63 s2 [ stack peek first ]
64 s3 [ stack pop second ]
65 s0 [ stack peek first ]
66 s1 [ stack pop second ]
67 table [ regexp nfa>> ] |
68 s1 eps s2 <entry> table set-entry
69 s1 table final-states>> delete-at
71 s0 s3 2array stack push ] ;
73 : apply-concatenation ( regexp -- )
74 [ stack>> ] [ (apply-concatenation) ] bi ;
76 : apply-concatenation? ( seq -- ? )
78 [ 2drop f ] [ 2 - swap nth array? ] if ;
80 : apply-loop ( seq regexp -- seq regexp )
82 2dup over apply-alternation?
83 [ (apply-alternation) ] [ (apply-concatenation) ] if apply-loop
86 : cut-out ( vector n -- vector' vector ) cut rest ;
88 : cut-stack ( obj vector -- vector' vector )
89 tuck last-index cut-out swap ;
91 : apply-til-last ( regexp token -- )
92 swap [ cut-stack ] change-stack
93 apply-loop stack>> push-all ;
95 : concatenation-loop ( regexp -- )
96 dup stack>> dup apply-concatenation?
97 [ over (apply-concatenation) concatenation-loop ] [ 2drop ] if ;
99 :: apply-kleene-closure ( regexp -- )
100 [let* | stack [ regexp stack>> ]
101 s0 [ stack peek first ]
102 s1 [ stack pop second ]
103 s2 [ regexp next-state ]
104 s3 [ regexp next-state ]
105 table [ regexp nfa>> ] |
106 s1 table final-states>> delete-at
107 t s3 table final-states>> set-at
109 s1 eps s0 <entry> table add-entry
110 s2 eps s0 <entry> table add-entry
111 s2 eps s3 <entry> table add-entry
112 s1 eps s3 <entry> table add-entry
113 s2 s3 2array stack push ] ;
115 : add-numbers ( n obj -- obj )
116 2dup [ number? ] bi@ and
117 [ + ] [ dup sequence? [ [ + ] with map ] [ nip ] if ] if ;
119 : increment-columns ( n assoc -- )
120 dup [ >r swap >r add-numbers r> r> set-at ] curry with* assoc-each ;
122 :: copy-state-rows ( regexp range -- )
123 [let* | len [ range range-length ]
124 offset [ regexp state>> range range-min - 1+ ]
125 state [ regexp [ len + ] change-state ] |
127 [ drop range member? ] assoc-filter
130 [ offset swap add-numbers ] assoc-map
132 regexp nfa>> [ assoc-union ] change-rows drop
133 range [ range-min ] [ range-max ] bi [ offset + ] bi@ 2array
134 regexp stack>> push ] ;
136 : last-state ( regexp -- range )
137 stack>> peek first2 [a,b] ;
139 : set-last-state-final ( ? regexp -- )
140 [ stack>> peek second ] [ nfa>> final-states>> ] bi set-at ;
142 : apply-plus-closure ( regexp -- )
143 [ dup last-state copy-state-rows ]
144 [ apply-kleene-closure ]
145 [ apply-concatenation ] tri ;
147 : apply-question-closure ( regexp -- )
148 [ stack>> peek first2 eps swap <entry> ] [ nfa>> add-entry ] bi ;
150 : with0 ( obj n quot -- n quot' ) swapd curry ; inline
153 : range>state ( range -- pair )
154 [ from>> ] [ length>> ] bi over - 2array ;
156 : copy-state ( regexp range n -- )
158 drop range>state over stack>> push apply-question-closure
160 [ copy-state-rows ] with0 with0 times
163 :: (exactly-n) ( regexp state n -- )
164 regexp state n copy-state
165 t regexp set-last-state-final ;
167 : exactly-n ( regexp n -- )
168 >r dup last-state r> 1- (exactly-n) ;
170 : exactly-n-concatenated ( regexp state n -- )
172 [ (exactly-n) ] 3keep
173 nip 1- [ apply-concatenation ] with0 times ;
175 :: at-least-n ( regexp n -- )
176 [let | state [ regexp stack>> pop first2 [a,b] ] |
177 regexp state n copy-state
178 state regexp stack>> push
179 regexp apply-kleene-closure ] ;
181 : peek-last ( regexp -- range )
182 stack>> peek first2 [a,b] ;
184 : pop-last ( regexp -- range )
185 stack>> pop first2 [a,b] ;
187 :: at-most-n ( regexp n -- )
188 [let | state [ regexp pop-last ] |
189 regexp state n [ 1+ exactly-n-concatenated ] with with each
190 regexp n 1- [ apply-alternation ] with0 times
191 regexp apply-question-closure ] ;
193 :: from-m-to-n ( regexp m n -- )
194 [let | state [ regexp pop-last ] |
196 m n [a,b] [ exactly-n-concatenated ] with with each
197 regexp n m - [ apply-alternation ] with0 times ] ;
199 : apply-brace-closure ( regexp from/f to/f comma? -- )
203 [ [ nip at-most-n ] [ at-least-n ] if* ] if
204 ] [ drop exactly-n ] if ;
206 :: push-single-nfa ( regexp obj -- )
207 [let | s0 [ regexp next-state ]
208 s1 [ regexp next-state ]
209 stack [ regexp stack>> ]
210 table [ regexp nfa>> ] |
211 s0 obj s1 <entry> table set-entry
213 t s1 table final-states>> set-at
214 s0 s1 2array stack push ] ;
216 : set-start-state ( regexp -- )
217 dup stack>> dup empty? [
220 [ nfa>> ] [ pop first ] bi* >>start-state drop
223 : ascii? ( n -- ? ) 0 HEX: 7f between? ;
224 : octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
225 : decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
227 : hex-digit? ( n -- ? )
229 [ dup decimal-digit? ]
230 [ dup CHAR: a CHAR: f between? ]
231 [ dup CHAR: A CHAR: F between? ]
234 : control-char? ( n -- ? )
236 [ dup 0 HEX: 1f between? ]
241 "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
243 : c-identifier-char? ( ch -- ? )
244 [ [ dup alpha? ] [ dup CHAR: _ = ] ] || nip ;
246 : java-blank? ( n -- ? )
248 CHAR: \s CHAR: \t CHAR: \n
249 HEX: b HEX: 7 CHAR: \r
252 : java-printable? ( n -- ? )
253 [ [ dup alpha? ] [ dup punct? ] ] || nip ;
255 ERROR: bad-character-class obj ;
257 : parse-posix-class ( -- quot )
260 [ get-char CHAR: } = ] take-until
262 { "Lower" [ [ letter? ] ] }
263 { "Upper" [ [ LETTER? ] ] }
264 { "ASCII" [ [ ascii? ] ] }
265 { "Alpha" [ [ Letter? ] ] }
266 { "Digit" [ [ digit? ] ] }
267 { "Alnum" [ [ alpha? ] ] }
268 { "Punct" [ [ punct? ] ] }
269 { "Graph" [ [ java-printable? ] ] }
270 { "Print" [ [ java-printable? ] ] }
271 { "Blank" [ [ " \t" member? ] ] }
272 { "Cntrl" [ [ control-char? ] ] }
273 { "XDigit" [ [ hex-digit? ] ] }
274 { "Space" [ [ java-blank? ] ] }
275 ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
276 [ bad-character-class ]
279 ERROR: bad-octal number ;
281 : parse-octal ( -- n )
284 dup 255 > [ bad-octal ] when ;
286 ERROR: bad-hex number ;
288 : parse-short-hex ( -- n )
290 dup number? [ bad-hex ] unless ;
292 : parse-long-hex ( -- n )
294 dup number? [ bad-hex ] unless ;
296 : parse-control-character ( -- n )
299 : dot-construction ( regexp -- )
300 [ CHAR: \n = not ] push-single-nfa ;
302 : front-anchor-construction ( regexp -- )
305 : back-anchor-construction ( regexp -- )
308 : parse-brace ( -- from/f to/f comma? )
310 [ get-char CHAR: } = ] take-until
311 "," split1 [ [ string>number ] bi@ ] keep >boolean ;
313 TUPLE: character-class members ;
314 TUPLE: character-class-range from to ;
315 TUPLE: negated-character-class < character-class ;
316 TUPLE: negated-character-class-range < character-class-range ;
317 TUPLE: intersection-class < character-class ;
318 TUPLE: negated-intersection-class < intersection-class ;
320 GENERIC: character-class-contains? ( obj character-class -- ? )
322 : parse-escaped-until ( -- seq )
323 [ get-char CHAR: \ = get-next CHAR: E = and ] take-until
326 : character-class-predicate ( seq -- quot )
327 boa '[ , character-class-contains? ] ;
329 ERROR: unmatched-escape-sequence ;
331 : (parse-escaped) ( regexp ? ch -- obj )
333 { CHAR: \ [ [ CHAR: \ = ] ] }
334 { CHAR: t [ [ CHAR: \t = ] ] }
335 { CHAR: n [ [ CHAR: \n = ] ] }
336 { CHAR: r [ [ CHAR: \r = ] ] }
337 { CHAR: f [ [ HEX: c = ] ] }
338 { CHAR: a [ [ HEX: 7 = ] ] }
339 { CHAR: e [ [ HEX: 1b = ] ] }
341 { CHAR: d [ [ digit? ] ] }
342 { CHAR: D [ [ digit? not ] ] }
343 { CHAR: s [ [ java-blank? ] ] }
344 { CHAR: S [ [ java-blank? not ] ] }
345 { CHAR: w [ [ c-identifier-char? ] ] }
346 { CHAR: W [ [ c-identifier-char? not ] ] }
348 { CHAR: p [ parse-posix-class ] }
349 { CHAR: P [ parse-posix-class [ not ] compose ] }
350 { CHAR: x [ parse-short-hex ] }
351 { CHAR: u [ parse-long-hex ] }
352 { CHAR: 0 [ parse-octal ] }
353 { CHAR: c [ parse-control-character ] }
355 ! { CHAR: Q [ next parse-escaped-until ] }
356 ! { CHAR: E [ unmatched-escape-sequence ] }
358 ! { CHAR: b [ ] } ! a word boundary
359 ! { CHAR: B [ ] } ! a non-word boundary
360 ! { CHAR: A [ ] } ! beginning of input
361 ! { CHAR: G [ ] } ! end of previous match
362 ! { CHAR: Z [ ] } ! end of input but for the final terminator, if any
363 ! { CHAR: z [ ] } ! end of the input
367 : parse-escaped ( regexp -- )
368 next get-char (parse-escaped) push-single-nfa ;
370 : handle-dash ( vector -- vector )
371 [ dup dash eq? [ drop CHAR: - ] when ] map ;
373 M: object character-class-contains? ( obj1 obj2 -- ? )
376 M: callable character-class-contains? ( obj1 callable -- ? )
379 M: character-class character-class-contains? ( obj cc -- ? )
380 members>> [ character-class-contains? ] with find drop >boolean ;
382 M: negated-character-class character-class-contains? ( obj cc -- ? )
383 call-next-method not ;
385 M: character-class-range character-class-contains? ( obj cc -- ? )
386 [ from>> ] [ to>> ] bi between? ;
388 M: negated-character-class-range character-class-contains? ( obj cc -- ? )
389 call-next-method not ;
391 M: intersection-class character-class-contains? ( obj cc -- ? )
392 members>> [ character-class-contains? not ] with find drop not ;
394 M: negated-intersection-class character-class-contains? ( obj cc -- ? )
395 call-next-method not ;
397 ERROR: unmatched-negated-character-class class ;
399 : handle-caret ( obj -- seq class )
400 dup [ length 2 >= ] [ first caret eq? ] bi and [
401 rest negated-character-class
406 : make-character-class ( regexp -- )
407 left-bracket over stack>> cut-stack
409 handle-dash handle-caret
410 character-class-predicate push-single-nfa ;
412 : apply-dash ( regexp -- )
413 stack>> dup [ pop ] [ pop* ] [ pop ] tri
414 swap character-class-range boa swap push ;
416 : apply-dash? ( regexp -- ? )
417 stack>> dup length 3 >=
418 [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
420 DEFER: parse-character-class
421 : (parse-character-class) ( regexp -- )
426 [ 1+ ] change-bracket-count dup left-bracket push-stack
427 parse-character-class
430 [ 1- ] change-bracket-count
433 { CHAR: - [ dash push-stack ] }
434 ! { CHAR: & [ ampersand push-stack ] }
435 ! { CHAR: : [ semicolon push-stack ] }
436 { CHAR: \ [ next get-char (parse-escaped) push-stack ] }
437 { f [ unbalanced-brackets ] }
438 [ dupd push-stack dup apply-dash? [ apply-dash ] [ drop ] if ]
441 dup bracket-count>> 0 >
442 [ (parse-character-class) ] [ drop ] if
445 : parse-character-class-second ( regexp -- )
448 { CHAR: [ [ CHAR: [ push-stack next ] }
449 { CHAR: ] [ CHAR: ] push-stack next ] }
450 { CHAR: - [ CHAR: - push-stack next ] }
454 : parse-character-class-first ( regexp -- )
457 { CHAR: ^ [ caret dupd push-stack next parse-character-class-second ] }
458 { CHAR: [ [ CHAR: [ push-stack next ] }
459 { CHAR: ] [ CHAR: ] push-stack next ] }
460 { CHAR: - [ CHAR: - push-stack next ] }
464 : parse-character-class ( regexp -- )
465 [ parse-character-class-first ] [ (parse-character-class) ] bi ;
467 ERROR: unsupported-token token ;
468 : parse-token ( regexp token -- )
470 { CHAR: ^ [ drop front-anchor-construction ] }
471 { CHAR: $ [ drop back-anchor-construction ] }
472 { CHAR: \ [ drop parse-escaped ] }
473 { CHAR: | [ drop dup concatenation-loop alternation push-stack ] }
474 { CHAR: ( [ drop [ 1+ ] change-parentheses-count left-parenthesis push-stack ] }
475 { CHAR: ) [ drop [ 1- ] change-parentheses-count left-parenthesis apply-til-last ] }
476 { CHAR: * [ drop apply-kleene-closure ] }
477 { CHAR: + [ drop apply-plus-closure ] }
478 { CHAR: ? [ drop apply-question-closure ] }
479 { CHAR: { [ drop parse-brace apply-brace-closure ] }
482 dup left-bracket push-stack
483 [ 1+ ] change-bracket-count parse-character-class
485 ! { CHAR: } [ drop drop "brace" ] }
486 { CHAR: . [ drop dot-construction ] }
487 { beginning-of-text [ push-stack ] }
490 [ check-unbalanced-parentheses ]
491 [ concatenation-loop ]
492 [ beginning-of-text apply-til-last ]
496 [ drop push-single-nfa ]
499 : (parse-raw-regexp) ( regexp -- )
500 get-char [ dupd parse-token next (parse-raw-regexp) ] [ drop ] if* ;
502 : parse-raw-regexp ( regexp -- )
503 [ beginning-of-text parse-token ]
505 dup raw>> dup empty? [
508 [ (parse-raw-regexp) ] string-parse
511 [ end-of-text parse-token ] tri ;
513 :: find-delta ( states obj table -- keys )
516 [ dup integer? [ 1array ] when unique ] [ H{ } ] if*
517 ] with map H{ } clone [ assoc-union ] reduce keys ;
519 :: (find-closure) ( states obj assoc table -- keys )
520 [let | size [ assoc assoc-size ] |
521 assoc states unique assoc-union
522 dup assoc-size size > [
525 dup integer? [ 1array ] when
526 obj rot table (find-closure)
533 : find-closure ( states obj table -- states )
534 >r H{ } r> (find-closure) keys ;
536 : find-epsilon-closure ( states table -- states )
537 >r eps H{ } r> (find-closure) keys ;
539 : filter-special-transition ( vec -- vec' )
540 [ drop eps = not ] assoc-filter ;
542 : initialize-subset-construction ( regexp -- )
545 nfa>> [ start-state>> 1array ] keep
546 find-epsilon-closure 1dlist
548 swap >>new-states drop
550 [ dfa>> ] [ nfa>> ] bi
551 columns>> filter-special-transition >>columns drop
554 :: (subset-construction) ( regexp -- )
555 [let* | nfa [ regexp nfa>> ]
557 new-states [ regexp new-states>> ]
558 columns [ dfa columns>> keys ] |
560 new-states dlist-empty? [
564 2dup nfa [ find-delta ] [ find-epsilon-closure ] bi
565 dup [ dfa rows>> key? ] [ empty? ] bi or [
566 dup new-states push-back
568 dup empty? [ 3drop ] [ <entry> dfa set-entry ] if
570 regexp (subset-construction)
573 : set-start/final-states ( regexp -- )
574 dup [ nfa>> start-state>> ]
575 [ dfa>> rows>> keys [ member? ] with filter first ] bi
576 >r dup dfa>> r> >>start-state drop
578 dup [ nfa>> final-states>> ] [ dfa>> rows>> ] bi
580 [ intersect empty? not ] with filter
581 >r dfa>> r> >>final-states drop ;
583 : subset-construction ( regexp -- )
584 [ initialize-subset-construction ]
585 [ (subset-construction) ]
586 [ set-start/final-states ] tri ;
588 : <regexp> ( raw -- obj )
591 0 >>parentheses-count
596 dup [ parse-raw-regexp ] [ subset-construction ] bi ;
598 ! Literal syntax for regexps
599 : parse-options ( string -- ? )
606 : parse-regexp ( accum end -- accum )
607 lexer get dup skip-blank
608 [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
609 ! lexer get dup still-parsing-line?
610 ! [ (parse-token) parse-options ] [ drop f ] if
613 : R! CHAR: ! parse-regexp ; parsing
614 : R" CHAR: " parse-regexp ; parsing
615 : R# CHAR: # parse-regexp ; parsing
616 : R' CHAR: ' parse-regexp ; parsing
617 : R( CHAR: ) parse-regexp ; parsing
618 : R/ CHAR: / parse-regexp ; parsing
619 : R@ CHAR: @ parse-regexp ; parsing
620 : R[ CHAR: ] parse-regexp ; parsing
621 : R` CHAR: ` parse-regexp ; parsing
622 : R{ CHAR: } parse-regexp ; parsing
623 : R| CHAR: | parse-regexp ; parsing
627 last-state current-state
629 start-index current-index
632 : <dfa-traverser> ( text dfa -- match )
635 swap [ start-state>> >>current-state ] keep
640 V{ } clone >>matches ;
642 : final-state? ( dfa-traverser -- ? )
643 [ current-state>> ] [ dfa>> final-states>> ] bi
646 : text-finished? ( dfa-traverser -- ? )
647 [ current-index>> ] [ text>> length ] bi >= ;
649 : save-final-state ( dfa-straverser -- )
650 [ current-index>> ] [ matches>> ] bi push ;
652 : match-done? ( dfa-traverser -- ? )
655 ] when text-finished? ;
657 : increment-state ( dfa-traverser state -- dfa-traverser )
658 >r [ 1+ ] change-current-index
659 dup current-state>> >>last-state r>
662 : match-transition ( obj hash -- state/f )
663 2dup keys [ callable? ] filter predicates
664 [ swap at nip ] [ at ] if* ;
666 : do-match ( dfa-traverser -- dfa-traverser )
674 at >r nth r> match-transition [
675 increment-state do-match
679 : return-match ( dfa-traverser -- interval/f )
680 dup matches>> empty? [
683 [ start-index>> ] [ matches>> peek ] bi 1 <range>
686 : match ( string regexp -- pair )
687 <dfa-traverser> do-match return-match ;
689 : matches? ( string regexp -- ? )
690 dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
692 : match-head ( string regexp -- end )
696 ! TUPLE: range-class from to ;
697 ! TUPLE: or-class left right ;
699 ! (?:a|b)* <- does not capture
700 ! (a|b)*\1 <- group captured
701 ! doesn't advance the current position:
702 ! (?=abba) positive lookahead matches abbaaa but not abaaa
703 ! (?!abba) negative lookahead matches ababa but not abbaa
704 ! look behind. "lookaround"
706 ! : $ ( n -- obj ) groups get nth ;
708 ! groups bound to scope here
712 ! match in a string with .*foo.*