MIXIN: traversal-flag
SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
+SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
+SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
lookahead-off add-traversal-flag
2 [ concatenate-nodes ] times ;
+M: lookbehind nfa-node ( node -- )
+ eps literal-transition add-simple-entry
+ lookbehind-on add-traversal-flag
+ term>> nfa-node
+ eps literal-transition add-simple-entry
+ lookbehind-off add-traversal-flag
+ 2 [ concatenate-nodes ] times ;
+
: construct-nfa ( regexp -- )
[
reset-regexp
[ [ nip at-most-n ] [ at-least-n ] if* ] if
] [ drop 0 max exactly-n ] if ;
+SINGLETON: beginning-of-input
+SINGLETON: end-of-input
+
+! : beginning-of-input ( -- obj )
: handle-front-anchor ( -- ) front-anchor push-stack ;
-: handle-back-anchor ( -- ) back-anchor push-stack ;
+: end-of-line ( -- obj )
+ end-of-input
+ CHAR: \r <constant>
+ CHAR: \n <constant>
+ 2dup 2array <concatenation> 4array <alternation> lookahead boa ;
+
+: handle-back-anchor ( -- ) end-of-line push-stack ;
ERROR: bad-character-class obj ;
ERROR: expected-posix-class ;
read1
{
{ CHAR: \ [ CHAR: \ <constant> ] }
+ { CHAR: ^ [ CHAR: ^ <constant> ] }
+ { CHAR: $ [ CHAR: $ <constant> ] }
{ CHAR: - [ CHAR: - <constant> ] }
{ CHAR: { [ CHAR: { <constant> ] }
{ CHAR: } [ CHAR: } <constant> ] }
{ CHAR: + [ CHAR: + <constant> ] }
{ CHAR: ? [ CHAR: ? <constant> ] }
{ CHAR: . [ CHAR: . <constant> ] }
-! xyzzy
{ CHAR: : [ CHAR: : <constant> ] }
{ CHAR: t [ CHAR: \t <constant> ] }
{ CHAR: n [ CHAR: \n <constant> ] }
{ CHAR: f [ HEX: c <constant> ] }
{ CHAR: a [ HEX: 7 <constant> ] }
{ CHAR: e [ HEX: 1b <constant> ] }
- { CHAR: $ [ CHAR: $ <constant> ] }
- { CHAR: ^ [ CHAR: ^ <constant> ] }
{ CHAR: d [ digit-class ] }
{ CHAR: D [ digit-class <negation> ] }
! { CHAR: G [ end of previous match ] }
! { CHAR: Z [ handle-end-of-input ] }
! { CHAR: z [ handle-end-of-input ] } ! except for terminator
-! xyzzy
- { CHAR: 1 [ CHAR: 1 <constant> ] }
- { CHAR: 2 [ CHAR: 2 <constant> ] }
- { CHAR: 3 [ CHAR: 3 <constant> ] }
- { CHAR: 4 [ CHAR: 4 <constant> ] }
- { CHAR: 5 [ CHAR: 5 <constant> ] }
- { CHAR: 6 [ CHAR: 6 <constant> ] }
- { CHAR: 7 [ CHAR: 7 <constant> ] }
- { CHAR: 8 [ CHAR: 8 <constant> ] }
- { CHAR: 9 [ CHAR: 9 <constant> ] }
+
+ ! { CHAR: 1 [ CHAR: 1 <constant> ] }
+ ! { CHAR: 2 [ CHAR: 2 <constant> ] }
+ ! { CHAR: 3 [ CHAR: 3 <constant> ] }
+ ! { CHAR: 4 [ CHAR: 4 <constant> ] }
+ ! { CHAR: 5 [ CHAR: 5 <constant> ] }
+ ! { CHAR: 6 [ CHAR: 6 <constant> ] }
+ ! { CHAR: 7 [ CHAR: 7 <constant> ] }
+ ! { CHAR: 8 [ CHAR: 8 <constant> ] }
+ ! { CHAR: 9 [ CHAR: 9 <constant> ] }
{ CHAR: Q [ parse-escaped-literals ] }
[ unrecognized-escape ]
: parse-regexp-token ( token -- ? )
{
+! todo: only match these at beginning/end of regexp
+ { CHAR: ^ [ handle-front-anchor t ] }
+ { CHAR: $ [ handle-back-anchor t ] }
+
{ CHAR: . [ handle-dot t ] }
{ CHAR: ( [ handle-left-parenthesis t ] }
{ CHAR: ) [ handle-right-parenthesis f ] }
{ CHAR: + [ handle-plus t ] }
{ CHAR: { [ handle-left-brace t ] }
{ CHAR: [ [ handle-left-bracket t ] }
- { CHAR: ^ [ handle-front-anchor t ] }
- { CHAR: $ [ handle-back-anchor t ] }
{ CHAR: \ [ handle-escape t ] }
[ <constant> push-stack t ]
} case ;
TUPLE: dfa-traverser
dfa-table
traversal-flags
+ traverse-forward
capture-groups
{ capture-group-index integer }
lookahead-counters
+ lookbehind-counters
last-state current-state
text
start-index current-index
swap [ start-state>> >>current-state ] keep
>>dfa-table
swap >>text
+ t >>traverse-forward
0 >>start-index
0 >>current-index
V{ } clone >>matches
V{ } clone >>capture-groups
+ V{ } clone >>lookbehind-counters
V{ } clone >>lookahead-counters ;
: final-state? ( dfa-traverser -- ? )
M: lookahead-off flag-action ( dfa-traverser flag -- )
drop
- dup lookahead-counters>> pop
- '[ _ - ] change-current-index drop ;
+ dup lookahead-counters>>
+ [ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ;
+
+M: lookbehind-on flag-action ( dfa-traverser flag -- )
+ drop
+ f >>traverse-forward
+ lookbehind-counters>> 0 swap push ;
+
+M: lookbehind-off flag-action ( dfa-traverser flag -- )
+ drop
+ t >>traverse-forward
+ dup lookbehind-counters>>
+ [ drop ] [ pop '[ _ + ] change-current-index drop ] if-empty ;
: process-flags ( dfa-traverser -- )
[ [ 1+ ] map ] change-lookahead-counters
: increment-state ( dfa-traverser state -- dfa-traverser )
[
- [ 1+ ] change-current-index dup current-state>> >>last-state
+ dup traverse-forward>>
+ [ [ 1+ ] change-current-index ]
+ [ [ 1- ] change-current-index ] if
+ dup current-state>> >>last-state
] dip
first >>current-state ;