1 ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators.short-circuit kernel
4 math namespaces regexp.ast regexp.classes
5 regexp.transition-tables sequences sets unicode vectors ;
8 ! This uses unicode for ch>upper and ch>lower
9 ! but case-insensitive matching should be done by case-folding everything
10 ! before processing starts
16 : next-state ( -- state )
17 state [ get ] [ inc ] bi ;
21 : set-each ( keys value hashtable -- )
22 '[ _ swap _ set-at ] each ;
24 : options>hash ( options -- hashtable )
26 [ [ on>> t ] dip set-each ]
27 [ [ off>> f ] dip set-each ] 2bi
30 : using-options ( options quot -- )
31 [ options>hash option-stack [ ?push ] change ] dip
32 call option-stack get pop* ; inline
34 : option? ( obj -- ? )
35 option-stack get assoc-stack ;
37 GENERIC: nfa-node ( node -- start-state end-state )
39 : add-simple-entry ( obj -- start-state end-state )
40 [ next-state next-state 2dup ] dip
41 nfa-table get add-transition ;
43 : epsilon-transition ( source target -- )
44 epsilon nfa-table get add-transition ;
46 M:: star nfa-node ( node -- start end )
47 node term>> nfa-node :> ( s0 s1 )
50 s1 s0 epsilon-transition
51 s2 s0 epsilon-transition
52 s2 s3 epsilon-transition
53 s1 s3 epsilon-transition
56 GENERIC: modify-epsilon ( tag -- newtag )
57 ! Potential off-by-one errors when lookaround nested in lookbehind
59 M: object modify-epsilon ;
61 : line-option ( multiline unix-lines default -- option )
63 drop [ unix-lines option? ] 2dip swap ?
66 M: $crlf modify-epsilon
67 $unix end-of-input line-option ;
69 M: ^crlf modify-epsilon
70 ^unix beginning-of-input line-option ;
72 M: tagged-epsilon nfa-node
73 clone [ modify-epsilon ] change-tag add-simple-entry ;
75 M: concatenation nfa-node
76 [ first>> ] [ second>> ] bi
77 reversed-regexp option? [ swap ] when
79 [ epsilon-transition ] dip ;
81 :: alternate-nodes ( s0 s1 s2 s3 -- start end )
84 s4 s0 epsilon-transition
85 s4 s2 epsilon-transition
86 s1 s5 epsilon-transition
87 s3 s5 epsilon-transition
90 M: alternation nfa-node
91 [ first>> ] [ second>> ] bi
95 GENERIC: modify-class ( char-class -- char-class' )
97 M: object modify-class ;
99 M: integer modify-class
100 case-insensitive option? [
102 [ ch>lower ] [ ch>upper ] bi 2array <or-class>
107 modify-class add-simple-entry ;
109 M: primitive-class modify-class
110 class>> modify-class <primitive-class> ;
112 M: or-class modify-class
113 seq>> [ modify-class ] map <or-class> ;
115 M: not-class modify-class
116 class>> modify-class <not-class> ;
118 MEMO: unix-dot ( -- class )
119 CHAR: \n <not-class> ;
121 MEMO: nonl-dot ( -- class )
122 { CHAR: \n CHAR: \r } <or-class> <not-class> ;
125 drop dotall option? [ t ] [
130 : modify-letter-class ( class -- newclass )
131 case-insensitive option? [ drop Letter-class ] when ;
132 M: letter-class modify-class modify-letter-class ;
133 M: LETTER-class modify-class modify-letter-class ;
135 : cased-range? ( range -- ? )
136 [ from>> ] [ to>> ] bi {
137 [ [ letter? ] both? ]
138 [ [ LETTER? ] both? ]
141 M: range-class modify-class
142 case-insensitive option? [
144 [ from>> ] [ to>> ] bi
145 [ [ ch>lower ] bi@ <range-class> ]
146 [ [ ch>upper ] bi@ <range-class> ] 2bi
152 modify-class add-simple-entry ;
154 M: with-options nfa-node
155 dup options>> [ tree>> nfa-node ] using-options ;
157 : construct-nfa ( ast -- nfa-table )
159 0 state namespaces:set
160 <transition-table> nfa-table namespaces:set
163 swap 1array fast-set >>final-states