unmaintained/regexp4/regexp4.factor

   1 ! Copyright (C) 2008 Doug Coleman.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors arrays assocs combinators kernel math
   4 sequences namespaces locals combinators.lib state-tables
   5 math.parser state-parser sets dlists unicode.categories
   6 math.order quotations shuffle math.ranges splitting
   7 symbols fry parser math.ranges inspector strings ;
   8 IN: regexp4
   9
  10 SYMBOLS: eps start-state final-state beginning-of-text
  11 end-of-text left-parenthesis alternation left-bracket
  12 caret dash ampersand colon ;
  13
  14 SYMBOL: runtime-epsilon
  15
  16 TUPLE: regexp raw parentheses-count bracket-count
  17 state stack nfa new-states dfa minimized-dfa
  18 dot-matches-newlines? capture-group captured-groups ;
  19
  20 TUPLE: capture-group n range ;
  21
  22 ERROR: parentheses-underflow ;
  23 ERROR: unbalanced-parentheses ;
  24 ERROR: unbalanced-brackets ;
  25
  26 : push-stack ( regexp token -- ) swap stack>> push ;
  27 : push-all-stack ( regexp seq -- ) swap stack>> push-all ;
  28 : next-state ( regexp -- n ) [ 1+ ] change-state state>> ;
  29
  30 : check-parentheses-underflow ( regexp -- )
  31     parentheses-count>> 0 < [ parentheses-underflow ] when ;
  32
  33 : check-unbalanced-parentheses ( regexp -- )
  34     parentheses-count>> 0 > [ unbalanced-parentheses ] when ;
  35
  36 :: (apply-alternation) ( stack regexp -- )
  37     [let | s2 [ stack peek first ]
  38            s3 [ stack pop second ]
  39            s0 [ stack peek alternation = [ stack pop* ] when stack peek first ]
  40            s1 [ stack pop second ]
  41            s4 [ regexp next-state ]
  42            s5 [ regexp next-state ]
  43            table [ regexp nfa>> ] |
  44         s5 table add-row
  45         s4 eps s0 <entry> table add-entry
  46         s4 eps s2 <entry> table add-entry
  47         s1 eps s5 <entry> table add-entry
  48         s3 eps s5 <entry> table add-entry
  49         s1 table final-states>> delete-at
  50         s3 table final-states>> delete-at
  51         t s5 table final-states>> set-at
  52         s4 s5 2array stack push ] ;
  53
  54 : apply-alternation ( regexp -- )
  55     [ stack>> ] [ (apply-alternation) ] bi ;
  56
  57 : apply-alternation? ( stack -- ? )
  58     dup length dup 3 <
  59     [ 2drop f ] [ 2 - swap nth alternation = ] if ;
  60
  61 :: (apply-concatenation) ( stack regexp -- )
  62     [let* |
  63             s2 [ stack peek first ]
  64             s3 [ stack pop second ]
  65             s0 [ stack peek first ]
  66             s1 [ stack pop second ]
  67             table [ regexp nfa>> ] |
  68         s1 eps s2 <entry> table set-entry
  69         s1 table final-states>> delete-at
  70         s3 table add-row
  71         s0 s3 2array stack push ] ;
  72
  73 : apply-concatenation ( regexp -- )
  74     [ stack>> ] [ (apply-concatenation) ] bi ;
  75
  76 : apply-concatenation? ( seq -- ? )
  77     dup length dup 2 <
  78     [ 2drop f ] [ 2 - swap nth array? ] if ;
  79
  80 : apply-loop ( seq regexp -- seq regexp )
  81     over length 1 > [
  82         2dup over apply-alternation?
  83         [ (apply-alternation) ] [ (apply-concatenation) ] if apply-loop
  84     ] when ;
  85
  86 : cut-out ( vector n -- vector' vector ) cut rest ;
  87
  88 : cut-stack ( obj vector -- vector' vector )
  89     tuck last-index cut-out swap ;
  90
  91 : apply-til-last ( regexp token -- )
  92     swap [ cut-stack ] change-stack
  93     apply-loop stack>> push-all ;
  94
  95 : concatenation-loop ( regexp -- )
  96     dup stack>> dup apply-concatenation?
  97     [ over (apply-concatenation) concatenation-loop ] [ 2drop ] if ;
  98
  99 :: apply-kleene-closure ( regexp -- )
 100     [let* | stack [ regexp stack>> ]
 101             s0 [ stack peek first ]
 102             s1 [ stack pop second ]
 103             s2 [ regexp next-state ]
 104             s3 [ regexp next-state ]
 105             table [ regexp nfa>> ] |
 106         s1 table final-states>> delete-at
 107         t s3 table final-states>> set-at
 108         s3 table add-row
 109         s1 eps s0 <entry> table add-entry
 110         s2 eps s0 <entry> table add-entry
 111         s2 eps s3 <entry> table add-entry
 112         s1 eps s3 <entry> table add-entry
 113         s2 s3 2array stack push ] ;
 114
 115 : add-numbers ( n obj -- obj )
 116     2dup [ number? ] bi@ and
 117     [ + ] [ dup sequence? [ [ + ] with map ] [ nip ] if ] if ;
 118
 119 : increment-columns ( n assoc -- )
 120     dup [ >r swap >r add-numbers r> r> set-at ] curry with* assoc-each ;
 121
 122 :: copy-state-rows ( regexp range -- )
 123     [let* | len [ range range-length ]
 124             offset [ regexp state>> range range-min - 1+ ]
 125             state [ regexp [ len + ] change-state ] |
 126         regexp nfa>> rows>>
 127         [ drop range member? ] assoc-filter
 128         [
 129             [ offset + ] dip
 130             [ offset swap add-numbers ] assoc-map
 131         ] assoc-map
 132         regexp nfa>> [ assoc-union ] change-rows drop
 133         range [ range-min ] [ range-max ] bi [ offset + ] bi@ 2array
 134         regexp stack>> push ] ;
 135
 136 : last-state ( regexp -- range )
 137     stack>> peek first2 [a,b] ;
 138
 139 : set-last-state-final ( ? regexp -- )
 140     [ stack>> peek second ] [ nfa>> final-states>> ] bi set-at ;
 141
 142 : apply-plus-closure ( regexp -- )
 143     [ dup last-state copy-state-rows ]
 144     [ apply-kleene-closure ]
 145     [ apply-concatenation ] tri ;
 146
 147 : apply-question-closure ( regexp -- )
 148     [ stack>> peek first2 eps swap <entry> ] [ nfa>> add-entry ] bi ;
 149
 150 : with0 ( obj n quot -- n quot' ) swapd curry ; inline
 151
 152
 153 : range>state ( range -- pair )
 154     [ from>> ] [ length>> ] bi over - 2array ;
 155
 156 : copy-state ( regexp range n -- )
 157     dup zero? [
 158         drop range>state over stack>> push apply-question-closure
 159     ] [
 160         [ copy-state-rows ] with0 with0 times
 161     ] if ;
 162
 163 :: (exactly-n) ( regexp state n -- )
 164     regexp state n copy-state
 165     t regexp set-last-state-final ;
 166
 167 : exactly-n ( regexp n -- )
 168     >r dup last-state r> 1- (exactly-n) ;
 169
 170 : exactly-n-concatenated ( regexp state n -- )
 171 B
 172     [ (exactly-n) ] 3keep
 173     nip 1- [ apply-concatenation ] with0 times ;
 174
 175 :: at-least-n ( regexp n -- )
 176     [let | state [ regexp stack>> pop first2 [a,b] ] |
 177         regexp state n copy-state
 178         state regexp stack>> push
 179         regexp apply-kleene-closure ] ;
 180
 181 : peek-last ( regexp -- range )
 182     stack>> peek first2 [a,b] ;
 183
 184 : pop-last ( regexp -- range )
 185     stack>> pop first2 [a,b] ;
 186
 187 :: at-most-n ( regexp n -- )
 188     [let | state [ regexp pop-last ] |
 189         regexp state n [ 1+ exactly-n-concatenated ] with with each
 190         regexp n 1- [ apply-alternation ] with0 times
 191         regexp apply-question-closure ] ;
 192
 193 :: from-m-to-n ( regexp m n -- )
 194     [let | state [ regexp pop-last ] |
 195         regexp state
 196         m n [a,b] [ exactly-n-concatenated ] with with each
 197         regexp n m - [ apply-alternation ] with0 times ] ;
 198
 199 : apply-brace-closure ( regexp from/f to/f comma? -- )
 200     [
 201         2dup and
 202         [ from-m-to-n ]
 203         [ [ nip at-most-n ] [ at-least-n ] if* ] if
 204     ] [ drop exactly-n ] if ;
 205
 206 :: push-single-nfa ( regexp obj -- )
 207     [let | s0 [ regexp next-state ]
 208            s1 [ regexp next-state ]
 209            stack [ regexp stack>> ]
 210            table [ regexp nfa>> ] |
 211         s0 obj s1 <entry> table set-entry
 212         s1 table add-row
 213         t s1 table final-states>> set-at
 214         s0 s1 2array stack push ] ;
 215
 216 : set-start-state ( regexp -- )
 217     dup stack>> dup empty? [
 218         2drop
 219     ] [
 220         [ nfa>> ] [ pop first ] bi* >>start-state drop
 221     ] if ;
 222
 223 : ascii? ( n -- ? ) 0 HEX: 7f between? ;
 224 : octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
 225 : decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
 226
 227 : hex-digit? ( n -- ? )
 228     [
 229         [ dup decimal-digit? ]
 230         [ dup CHAR: a CHAR: f between? ]
 231         [ dup CHAR: A CHAR: F between? ]
 232     ] || nip ;
 233
 234 : control-char? ( n -- ? )
 235     [
 236         [ dup 0 HEX: 1f between? ]
 237         [ dup HEX: 7f = ]
 238     ] || nip ;
 239
 240 : punct? ( n -- ? )
 241     "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
 242
 243 : c-identifier-char? ( ch -- ? )
 244     [ [ dup alpha? ] [ dup CHAR: _ = ] ] || nip ;
 245
 246 : java-blank? ( n -- ? )
 247     {
 248         CHAR: \s CHAR: \t CHAR: \n
 249         HEX: b HEX: 7 CHAR: \r
 250     } member? ;
 251
 252 : java-printable? ( n -- ? )
 253     [ [ dup alpha? ] [ dup punct? ] ] || nip ;
 254
 255 ERROR: bad-character-class obj ;
 256
 257 : parse-posix-class ( -- quot )
 258     next
 259     CHAR: { expect
 260     [ get-char CHAR: } = ] take-until
 261     {
 262         { "Lower" [ [ letter? ] ] }
 263         { "Upper" [ [ LETTER? ] ] }
 264         { "ASCII" [ [ ascii? ] ] }
 265         { "Alpha" [ [ Letter? ] ] }
 266         { "Digit" [ [ digit? ] ] }
 267         { "Alnum" [ [ alpha? ] ] }
 268         { "Punct" [ [ punct? ] ] }
 269         { "Graph" [ [ java-printable? ] ] }
 270         { "Print" [ [ java-printable? ] ] }
 271         { "Blank" [ [ " \t" member? ] ] }
 272         { "Cntrl" [ [ control-char? ] ] }
 273         { "XDigit" [ [ hex-digit? ] ] }
 274         { "Space" [ [ java-blank? ] ] }
 275         ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
 276         [ bad-character-class ]
 277     } case ;
 278
 279 ERROR: bad-octal number ;
 280
 281 : parse-octal ( -- n )
 282     next get-char drop
 283     3 take oct>
 284     dup 255 > [ bad-octal ] when ;
 285
 286 ERROR: bad-hex number ;
 287
 288 : parse-short-hex ( -- n )
 289     next 2 take hex>
 290     dup number? [ bad-hex ] unless ;
 291
 292 : parse-long-hex ( -- n )
 293     next 6 take hex>
 294     dup number? [ bad-hex ] unless ;
 295
 296 : parse-control-character ( -- n )
 297     next get-char ;
 298
 299 : dot-construction ( regexp -- )
 300     [ CHAR: \n = not ] push-single-nfa ;
 301
 302 : front-anchor-construction ( regexp -- )
 303     drop ;
 304
 305 : back-anchor-construction  ( regexp -- )
 306     drop ;
 307
 308 : parse-brace ( -- from/f to/f comma? )
 309     next
 310     [ get-char CHAR: } = ] take-until
 311     "," split1 [ [ string>number ] bi@ ] keep >boolean ;
 312
 313 TUPLE: character-class members ;
 314 TUPLE: character-class-range from to ;
 315 TUPLE: negated-character-class < character-class ;
 316 TUPLE: negated-character-class-range < character-class-range ;
 317 TUPLE: intersection-class < character-class ;
 318 TUPLE: negated-intersection-class < intersection-class ;
 319
 320 GENERIC: character-class-contains? ( obj character-class -- ? )
 321
 322 : parse-escaped-until ( -- seq )
 323     [ get-char CHAR: \ = get-next CHAR: E = and ] take-until
 324     next ;
 325
 326 : character-class-predicate ( seq -- quot )
 327     boa '[ , character-class-contains? ] ;
 328
 329 ERROR: unmatched-escape-sequence ;
 330
 331 : (parse-escaped) ( regexp ? ch -- obj )
 332     {
 333         { CHAR: \ [ [ CHAR: \ = ] ] }
 334         { CHAR: t [ [ CHAR: \t = ] ] }
 335         { CHAR: n [ [ CHAR: \n = ] ] }
 336         { CHAR: r [ [ CHAR: \r = ] ] }
 337         { CHAR: f [ [ HEX: c = ] ] }
 338         { CHAR: a [ [ HEX: 7 = ] ] }
 339         { CHAR: e [ [ HEX: 1b = ] ] }
 340
 341         { CHAR: d [ [ digit? ] ] }
 342         { CHAR: D [ [ digit? not ] ] }
 343         { CHAR: s [ [ java-blank? ] ] }
 344         { CHAR: S [ [ java-blank? not ] ] }
 345         { CHAR: w [ [ c-identifier-char? ] ] }
 346         { CHAR: W [ [ c-identifier-char? not ] ] }
 347
 348         { CHAR: p [ parse-posix-class ] }
 349         { CHAR: P [ parse-posix-class [ not ] compose ] }
 350         { CHAR: x [ parse-short-hex ] }
 351         { CHAR: u [ parse-long-hex ] }
 352         { CHAR: 0 [ parse-octal ] }
 353         { CHAR: c [ parse-control-character ] }
 354
 355         ! { CHAR: Q [ next parse-escaped-until ] }
 356         ! { CHAR: E [ unmatched-escape-sequence ] }
 357
 358         ! { CHAR: b [ ] } ! a word boundary
 359         ! { CHAR: B [ ] } ! a non-word boundary
 360         ! { CHAR: A [ ] } ! beginning of input
 361         ! { CHAR: G [ ] } ! end of previous match
 362         ! { CHAR: Z [ ] } ! end of input but for the final terminator, if any
 363         ! { CHAR: z [ ] } ! end of the input
 364         [ ]
 365     } case ;
 366
 367 : parse-escaped ( regexp -- )
 368     next get-char (parse-escaped) push-single-nfa ;
 369
 370 : handle-dash ( vector -- vector )
 371     [ dup dash eq? [ drop CHAR: - ] when ] map ;
 372
 373 M: object character-class-contains? ( obj1 obj2 -- ? )
 374     = ;
 375
 376 M: callable character-class-contains? ( obj1 callable -- ? )
 377     call ;
 378
 379 M: character-class character-class-contains? ( obj cc -- ? )
 380     members>> [ character-class-contains? ] with find drop >boolean ;
 381
 382 M: negated-character-class character-class-contains? ( obj cc -- ? )
 383     call-next-method not ;
 384
 385 M: character-class-range character-class-contains? ( obj cc -- ? )
 386     [ from>> ] [ to>> ] bi between?  ;
 387
 388 M: negated-character-class-range character-class-contains? ( obj cc -- ? )
 389     call-next-method not ;
 390
 391 M: intersection-class character-class-contains? ( obj cc -- ? )
 392     members>> [ character-class-contains? not ] with find drop not ;
 393
 394 M: negated-intersection-class character-class-contains? ( obj cc -- ? )
 395     call-next-method not ;
 396
 397 ERROR: unmatched-negated-character-class class ;
 398
 399 : handle-caret ( obj -- seq class )
 400     dup [ length 2 >= ] [ first caret eq? ] bi and [
 401         rest negated-character-class
 402     ] [
 403         character-class
 404     ] if ;
 405
 406 : make-character-class ( regexp -- )
 407     left-bracket over stack>> cut-stack
 408     pick (>>stack)
 409     handle-dash handle-caret
 410     character-class-predicate push-single-nfa ;
 411
 412 : apply-dash ( regexp -- )
 413     stack>> dup [ pop ] [ pop* ] [ pop ] tri
 414     swap character-class-range boa swap push ;
 415
 416 : apply-dash? ( regexp -- ? )
 417     stack>> dup length 3 >=
 418     [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
 419
 420 DEFER: parse-character-class
 421 : (parse-character-class) ( regexp -- )
 422     [
 423         next get-char
 424         {
 425             { CHAR: [ [
 426                 [ 1+ ] change-bracket-count dup left-bracket push-stack
 427                 parse-character-class
 428             ] }
 429             { CHAR: ] [
 430                 [ 1- ] change-bracket-count
 431                 make-character-class
 432             ] }
 433             { CHAR: - [ dash push-stack ] }
 434             ! { CHAR: & [ ampersand push-stack ] }
 435             ! { CHAR: : [ semicolon push-stack ] }
 436             { CHAR: \ [ next get-char (parse-escaped) push-stack ] }
 437             { f [ unbalanced-brackets ] }
 438             [ dupd push-stack dup apply-dash? [ apply-dash ] [ drop ] if ]
 439         } case
 440     ] [
 441         dup bracket-count>> 0 >
 442         [ (parse-character-class) ] [ drop ] if
 443     ] bi ;
 444
 445 : parse-character-class-second ( regexp -- )
 446     get-next
 447     {
 448         { CHAR: [ [ CHAR: [ push-stack next ] }
 449         { CHAR: ] [ CHAR: ] push-stack next ] }
 450         { CHAR: - [ CHAR: - push-stack next ] }
 451         [ 2drop ]
 452     } case ;
 453
 454 : parse-character-class-first ( regexp -- )
 455     get-next
 456     {
 457         { CHAR: ^ [ caret dupd push-stack next parse-character-class-second ] }
 458         { CHAR: [ [ CHAR: [ push-stack next ] }
 459         { CHAR: ] [ CHAR: ] push-stack next ] }
 460         { CHAR: - [ CHAR: - push-stack next ] }
 461         [ 2drop ]
 462     } case ;
 463
 464 : parse-character-class ( regexp -- )
 465     [ parse-character-class-first ] [ (parse-character-class) ] bi ;
 466
 467 ERROR: unsupported-token token ;
 468 : parse-token ( regexp token -- )
 469     dup {
 470         { CHAR: ^ [ drop front-anchor-construction ] }
 471         { CHAR: $ [ drop back-anchor-construction ] }
 472         { CHAR: \ [ drop parse-escaped ] }
 473         { CHAR: | [ drop dup concatenation-loop alternation push-stack ] }
 474         { CHAR: ( [ drop [ 1+ ] change-parentheses-count left-parenthesis push-stack ] }
 475         { CHAR: ) [ drop [ 1- ] change-parentheses-count left-parenthesis apply-til-last ] }
 476         { CHAR: * [ drop apply-kleene-closure ] }
 477         { CHAR: + [ drop apply-plus-closure ] }
 478         { CHAR: ? [ drop apply-question-closure ] }
 479         { CHAR: { [ drop parse-brace apply-brace-closure ] }
 480         { CHAR: [ [
 481             drop
 482             dup left-bracket push-stack
 483             [ 1+ ] change-bracket-count parse-character-class
 484         ] }
 485         ! { CHAR: } [ drop drop "brace" ] }
 486         { CHAR: . [ drop dot-construction ] }
 487         { beginning-of-text [ push-stack ] }
 488         { end-of-text [
 489             drop {
 490                 [ check-unbalanced-parentheses ]
 491                 [ concatenation-loop ]
 492                 [ beginning-of-text apply-til-last ]
 493                 [ set-start-state ]
 494             } cleave
 495         ] }
 496         [ drop push-single-nfa ]
 497     } case ;
 498
 499 : (parse-raw-regexp) ( regexp -- )
 500     get-char [ dupd parse-token next (parse-raw-regexp) ] [ drop ] if* ;
 501
 502 : parse-raw-regexp ( regexp -- )
 503     [ beginning-of-text parse-token ]
 504     [
 505         dup raw>> dup empty? [
 506             2drop
 507         ] [
 508             [ (parse-raw-regexp) ] string-parse
 509         ] if
 510     ]
 511     [ end-of-text parse-token ] tri ;
 512
 513 :: find-delta ( states obj table -- keys )
 514     obj states [
 515         table get-row at
 516         [ dup integer? [ 1array ] when unique ] [ H{ } ] if*
 517     ] with map H{ } clone [ assoc-union ] reduce keys ;
 518
 519 :: (find-closure) ( states obj assoc table -- keys )
 520     [let | size [ assoc assoc-size ] |
 521         assoc states unique assoc-union
 522         dup assoc-size size > [
 523             obj states [
 524                 table get-row at* [
 525                     dup integer? [ 1array ] when
 526                     obj rot table (find-closure)
 527                 ] [
 528                     drop
 529                 ] if
 530             ] with each
 531         ] when ] ;
 532
 533 : find-closure ( states obj table -- states )
 534     >r H{ } r> (find-closure) keys ;
 535
 536 : find-epsilon-closure ( states table -- states )
 537     >r eps H{ } r> (find-closure) keys ;
 538
 539 : filter-special-transition ( vec -- vec' )
 540     [ drop eps = not ] assoc-filter ;
 541
 542 : initialize-subset-construction ( regexp -- )
 543     <vector-table> >>dfa
 544     [
 545         nfa>> [ start-state>> 1array ] keep
 546         find-epsilon-closure 1dlist
 547     ] [
 548         swap >>new-states drop
 549     ] [
 550         [ dfa>> ] [ nfa>> ] bi
 551         columns>> filter-special-transition >>columns drop
 552     ] tri ;
 553
 554 :: (subset-construction) ( regexp -- )
 555     [let* | nfa [ regexp nfa>> ]
 556            dfa [ regexp dfa>> ]
 557            new-states [ regexp new-states>> ]
 558            columns [ dfa columns>> keys ] |
 559
 560         new-states dlist-empty? [
 561             new-states pop-front
 562             dup dfa add-row
 563             columns [
 564                 2dup nfa [ find-delta ] [ find-epsilon-closure ] bi
 565                 dup [ dfa rows>> key? ] [ empty? ] bi or [
 566                     dup new-states push-back
 567                 ] unless
 568                 dup empty? [ 3drop ] [ <entry> dfa set-entry ] if
 569             ] with each
 570             regexp (subset-construction)
 571         ] unless ] ;
 572
 573 : set-start/final-states ( regexp -- )
 574     dup [ nfa>> start-state>> ]
 575     [ dfa>> rows>> keys [ member? ] with filter first ] bi
 576     >r dup dfa>> r> >>start-state drop
 577
 578     dup [ nfa>> final-states>> ] [ dfa>> rows>> ] bi
 579     [ keys ] bi@
 580     [ intersect empty? not ] with filter
 581     >r dfa>> r> >>final-states drop ;
 582
 583 : subset-construction ( regexp -- )
 584     [ initialize-subset-construction ]
 585     [ (subset-construction) ]
 586     [ set-start/final-states ] tri ;
 587
 588 : <regexp> ( raw -- obj )
 589     regexp new
 590         swap >>raw
 591         0 >>parentheses-count
 592         0 >>bracket-count
 593         -1 >>state
 594         V{ } clone >>stack
 595         <vector-table> >>nfa
 596         dup [ parse-raw-regexp ] [ subset-construction ] bi ;
 597
 598 ! Literal syntax for regexps
 599 : parse-options ( string -- ? )
 600     #! Lame
 601     {
 602         { "" [ f ] }
 603         { "i" [ t ] }
 604     } case ;
 605
 606 : parse-regexp ( accum end -- accum )
 607     lexer get dup skip-blank
 608     [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
 609     ! lexer get dup still-parsing-line?
 610     ! [ (parse-token) parse-options ] [ drop f ] if
 611     <regexp> parsed ;
 612
 613 : R! CHAR: ! parse-regexp ; parsing
 614 : R" CHAR: " parse-regexp ; parsing
 615 : R# CHAR: # parse-regexp ; parsing
 616 : R' CHAR: ' parse-regexp ; parsing
 617 : R( CHAR: ) parse-regexp ; parsing
 618 : R/ CHAR: / parse-regexp ; parsing
 619 : R@ CHAR: @ parse-regexp ; parsing
 620 : R[ CHAR: ] parse-regexp ; parsing
 621 : R` CHAR: ` parse-regexp ; parsing
 622 : R{ CHAR: } parse-regexp ; parsing
 623 : R| CHAR: | parse-regexp ; parsing
 624
 625 TUPLE: dfa-traverser
 626     dfa
 627     last-state current-state
 628     text
 629     start-index current-index
 630     matches ;
 631
 632 : <dfa-traverser> ( text dfa -- match )
 633     dfa>>
 634     dfa-traverser new
 635         swap [ start-state>> >>current-state ] keep
 636         >>dfa
 637         swap >>text
 638         0 >>start-index
 639         0 >>current-index
 640         V{ } clone >>matches ;
 641
 642 : final-state? ( dfa-traverser -- ? )
 643     [ current-state>> ] [ dfa>> final-states>> ] bi
 644     member? ;
 645
 646 : text-finished? ( dfa-traverser -- ? )
 647     [ current-index>> ] [ text>> length ] bi >= ;
 648
 649 : save-final-state ( dfa-straverser -- )
 650     [ current-index>> ] [ matches>> ] bi push ;
 651
 652 : match-done? ( dfa-traverser -- ? )
 653     dup final-state? [
 654         dup save-final-state
 655     ] when text-finished? ;
 656
 657 : increment-state ( dfa-traverser state -- dfa-traverser )
 658     >r [ 1+ ] change-current-index
 659     dup current-state>> >>last-state r>
 660     >>current-state ;
 661
 662 : match-transition ( obj hash -- state/f )
 663     2dup keys [ callable? ] filter predicates
 664     [ swap at nip ] [ at ] if* ;
 665
 666 : do-match ( dfa-traverser -- dfa-traverser )
 667     dup match-done? [
 668         dup {
 669             [ current-index>> ]
 670             [ text>> ]
 671             [ current-state>> ]
 672             [ dfa>> rows>> ]
 673         } cleave
 674         at >r nth r> match-transition [
 675             increment-state do-match
 676         ] when*
 677     ] unless ;
 678
 679 : return-match ( dfa-traverser -- interval/f )
 680     dup matches>> empty? [
 681         drop f
 682     ] [
 683         [ start-index>> ] [ matches>> peek ] bi 1 <range>
 684     ] if ;
 685
 686 : match ( string regexp -- pair )
 687     <dfa-traverser> do-match return-match ;
 688
 689 : matches? ( string regexp -- ? )
 690     dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
 691
 692 : match-head ( string regexp -- end )
 693     match length>> 1- ;
 694
 695 ! character classes
 696 ! TUPLE: range-class from to ;
 697 ! TUPLE: or-class left right ;
 698
 699 ! (?:a|b)*  <- does not capture
 700 ! (a|b)*\1  <- group captured
 701 ! doesn't advance the current position:
 702 ! (?=abba)  positive lookahead  matches abbaaa but not abaaa
 703 ! (?!abba)  negative lookahead  matches ababa but not abbaa
 704 ! look behind.   "lookaround"
 705
 706 ! : $ ( n -- obj ) groups get nth ;
 707 ! [
 708     ! groups bound to scope here
 709 ! ] [
 710     ! error or something
 711 ! ] if-match
 712 ! match in a string with  .*foo.*