1 ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs grouping kernel locals math namespaces
4 sequences fry quotations math.order math.ranges vectors
5 unicode.categories regexp.transition-tables words sets hashtables
6 combinators.short-circuit unicode.data regexp.ast
7 regexp.classes memoize ;
8 FROM: namespaces => set ;
11 ! This uses unicode.data for ch>upper and ch>lower
12 ! but case-insensitive matching should be done by case-folding everything
13 ! before processing starts
19 : next-state ( -- state )
20 state [ get ] [ inc ] bi ;
24 : set-each ( keys value hashtable -- )
25 '[ _ swap _ set-at ] each ;
27 : options>hash ( options -- hashtable )
29 [ [ on>> t ] dip set-each ]
30 [ [ off>> f ] dip set-each ] 2bi
33 : using-options ( options quot -- )
34 [ options>hash option-stack [ ?push ] change ] dip
35 call option-stack get pop* ; inline
37 : option? ( obj -- ? )
38 option-stack get assoc-stack ;
40 GENERIC: nfa-node ( node -- start-state end-state )
42 : add-simple-entry ( obj -- start-state end-state )
43 [ next-state next-state 2dup ] dip
44 nfa-table get add-transition ;
46 : epsilon-transition ( source target -- )
47 epsilon nfa-table get add-transition ;
49 M:: star nfa-node ( node -- start end )
50 node term>> nfa-node :> ( s0 s1 )
53 s1 s0 epsilon-transition
54 s2 s0 epsilon-transition
55 s2 s3 epsilon-transition
56 s1 s3 epsilon-transition
59 GENERIC: modify-epsilon ( tag -- newtag )
60 ! Potential off-by-one errors when lookaround nested in lookbehind
62 M: object modify-epsilon ;
64 : line-option ( multiline unix-lines default -- option )
66 drop [ unix-lines option? ] 2dip swap ?
70 $unix end-of-input line-option ;
73 ^unix beginning-of-input line-option ;
75 M: tagged-epsilon nfa-node
76 clone [ modify-epsilon ] change-tag add-simple-entry ;
78 M: concatenation nfa-node ( node -- start end )
79 [ first>> ] [ second>> ] bi
80 reversed-regexp option? [ swap ] when
82 [ epsilon-transition ] dip ;
84 :: alternate-nodes ( s0 s1 s2 s3 -- start end )
87 s4 s0 epsilon-transition
88 s4 s2 epsilon-transition
89 s1 s5 epsilon-transition
90 s3 s5 epsilon-transition
93 M: alternation nfa-node ( node -- start end )
94 [ first>> ] [ second>> ] bi
98 GENERIC: modify-class ( char-class -- char-class' )
100 M: object modify-class ;
102 M: integer modify-class
103 case-insensitive option? [
105 [ ch>lower ] [ ch>upper ] bi 2array <or-class>
109 M: integer nfa-node ( node -- start end )
110 modify-class add-simple-entry ;
112 M: primitive-class modify-class
113 class>> modify-class <primitive-class> ;
115 M: or-class modify-class
116 seq>> [ modify-class ] map <or-class> ;
118 M: not-class modify-class
119 class>> modify-class <not-class> ;
121 MEMO: unix-dot ( -- class )
122 CHAR: \n <not-class> ;
124 MEMO: nonl-dot ( -- class )
125 { CHAR: \n CHAR: \r } <or-class> <not-class> ;
128 drop dotall option? [ t ] [
133 : modify-letter-class ( class -- newclass )
134 case-insensitive option? [ drop Letter-class ] when ;
135 M: letter-class modify-class modify-letter-class ;
136 M: LETTER-class modify-class modify-letter-class ;
138 : cased-range? ( range -- ? )
139 [ from>> ] [ to>> ] bi {
140 [ [ letter? ] bi@ and ]
141 [ [ LETTER? ] bi@ and ]
144 M: range-class modify-class
145 case-insensitive option? [
147 [ from>> ] [ to>> ] bi
148 [ [ ch>lower ] bi@ <range-class> ]
149 [ [ ch>upper ] bi@ <range-class> ] 2bi
155 modify-class add-simple-entry ;
157 M: with-options nfa-node ( node -- start end )
158 dup options>> [ tree>> nfa-node ] using-options ;
160 : construct-nfa ( ast -- nfa-table )
163 <transition-table> nfa-table set
166 swap dup associate >>final-states