1 ! Copyright (C) 2008 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs grouping kernel regexp.backend
4 locals math namespaces regexp.parser sequences fry quotations
5 math.order math.ranges vectors unicode.categories regexp.utils
6 regexp.transition-tables words sets regexp.classes unicode.case.private ;
7 ! This uses unicode.case.private for ch>upper and ch>lower
8 ! but case-insensitive matching should be done by case-folding everything
9 ! before processing starts
12 ERROR: feature-is-broken feature ;
18 : options ( -- obj ) current-regexp get options>> ;
20 : option? ( obj -- ? ) options key? ;
22 : option-on ( obj -- ) options conjoin ;
24 : option-off ( obj -- ) options delete-at ;
26 : next-state ( regexp -- state )
27 [ state>> ] [ [ 1+ ] change-state drop ] bi ;
29 : set-start-state ( regexp -- )
33 [ nfa-table>> ] [ pop first ] bi* >>start-state drop
36 GENERIC: nfa-node ( node -- )
38 :: add-simple-entry ( obj class -- )
39 [let* | regexp [ current-regexp get ]
40 s0 [ regexp next-state ]
41 s1 [ regexp next-state ]
42 stack [ regexp stack>> ]
43 table [ regexp nfa-table>> ] |
45 s0 f obj class make-transition table add-transition
46 s0 s1 <default-transition> table add-transition
48 s0 s1 obj class make-transition table add-transition
50 s0 s1 2array stack push
51 t s1 table final-states>> set-at ] ;
53 :: concatenate-nodes ( -- )
54 [let* | regexp [ current-regexp get ]
55 stack [ regexp stack>> ]
56 table [ regexp nfa-table>> ]
57 s2 [ stack peek first ]
58 s3 [ stack pop second ]
59 s0 [ stack peek first ]
60 s1 [ stack pop second ] |
61 s1 s2 eps <literal-transition> table add-transition
62 s1 table final-states>> delete-at
63 s0 s3 2array stack push ] ;
65 :: alternate-nodes ( -- )
66 [let* | regexp [ current-regexp get ]
67 stack [ regexp stack>> ]
68 table [ regexp nfa-table>> ]
69 s2 [ stack peek first ]
70 s3 [ stack pop second ]
71 s0 [ stack peek first ]
72 s1 [ stack pop second ]
73 s4 [ regexp next-state ]
74 s5 [ regexp next-state ] |
75 s4 s0 eps <literal-transition> table add-transition
76 s4 s2 eps <literal-transition> table add-transition
77 s1 s5 eps <literal-transition> table add-transition
78 s3 s5 eps <literal-transition> table add-transition
79 s1 table final-states>> delete-at
80 s3 table final-states>> delete-at
81 t s5 table final-states>> set-at
82 s4 s5 2array stack push ] ;
84 M: star nfa-node ( node -- )
86 [let* | regexp [ current-regexp get ]
87 stack [ regexp stack>> ]
88 s0 [ stack peek first ]
89 s1 [ stack pop second ]
90 s2 [ regexp next-state ]
91 s3 [ regexp next-state ]
92 table [ regexp nfa-table>> ] |
93 s1 table final-states>> delete-at
94 t s3 table final-states>> set-at
95 s1 s0 eps <literal-transition> table add-transition
96 s2 s0 eps <literal-transition> table add-transition
97 s2 s3 eps <literal-transition> table add-transition
98 s1 s3 eps <literal-transition> table add-transition
99 s2 s3 2array stack push ] ;
101 M: concatenation nfa-node ( node -- )
103 reversed-regexp option? [ <reversed> ] when
104 [ [ nfa-node ] each ]
105 [ length 1- [ concatenate-nodes ] times ] bi ;
107 M: alternation nfa-node ( node -- )
109 [ [ nfa-node ] each ]
110 [ length 1- [ alternate-nodes ] times ] bi ;
112 M: constant nfa-node ( node -- )
113 case-insensitive option? [
114 dup char>> [ ch>lower ] [ ch>upper ] bi
117 char>> literal-transition add-simple-entry
119 [ literal-transition add-simple-entry ] bi@
123 char>> literal-transition add-simple-entry
126 M: word nfa-node ( node -- ) class-transition add-simple-entry ;
128 M: any-char nfa-node ( node -- )
129 [ dotall option? ] dip any-char-no-nl ?
130 class-transition add-simple-entry ;
132 M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
134 M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
136 : choose-letter-class ( node -- node' )
137 case-insensitive option? Letter-class rot ? ;
139 M: letter-class nfa-node ( node -- )
140 choose-letter-class class-transition add-simple-entry ;
142 M: LETTER-class nfa-node ( node -- )
143 choose-letter-class class-transition add-simple-entry ;
145 M: character-class-range nfa-node ( node -- )
146 case-insensitive option? [
147 ! This should be implemented for Unicode by case-folding
148 ! the input and all strings in the regexp.
149 dup [ from>> ] [ to>> ] bi
150 2dup [ Letter? ] bi@ and [
152 [ [ ch>lower ] bi@ character-class-range boa ]
153 [ [ ch>upper ] bi@ character-class-range boa ] 2bi
154 [ class-transition add-simple-entry ] bi@
158 class-transition add-simple-entry
161 class-transition add-simple-entry
164 M: option nfa-node ( node -- )
165 [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
166 eps literal-transition add-simple-entry ;
168 : construct-nfa ( regexp -- )
171 [ current-regexp set ]
172 [ parse-tree>> nfa-node ]
173 [ set-start-state ] tri