1 USING: assocs combinators.lib kernel math math.parser
2 namespaces peg unicode.case sequences unicode.categories
3 memoize peg.parsers math.order ;
12 : char=-quot ( ch -- quot )
14 [ ch>upper [ swap ch>upper = ] ] [ [ = ] ] if
17 : char-between?-quot ( ch1 ch2 -- quot )
19 [ [ ch>upper ] bi@ [ >r >r ch>upper r> r> between? ] ]
23 : or-predicates ( quots -- quot )
24 [ \ dup prefix ] map [ [ t ] ] f short-circuit \ nip suffix ;
26 : literal-action [ nip ] curry action ;
28 : delay-action [ curry ] curry action ;
35 : octal-digit? ( n -- ? )
36 CHAR: 0 CHAR: 7 between? ;
38 : hex-digit? ( n -- ? )
41 [ dup CHAR: a CHAR: f between? ]
42 [ dup CHAR: A CHAR: F between? ]
45 : control-char? ( n -- ? )
46 { [ dup 0 HEX: 1f between? ] [ dup HEX: 7f = ] } || nip ;
49 "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
51 : c-identifier-char? ( ch -- ? )
52 { [ dup alpha? ] [ dup CHAR: _ = ] } || nip ;
54 : java-blank? ( n -- ? )
57 CHAR: \t CHAR: \n CHAR: \r
61 : java-printable? ( n -- ? )
62 { [ dup alpha? ] [ dup punct? ] } || nip ;
64 MEMO: 'ordinary-char' ( -- parser )
65 [ "\\^*+?|(){}[$" member? not ] satisfy
66 [ char=-quot ] action ;
68 MEMO: 'octal-digit' ( -- parser ) [ octal-digit? ] satisfy ;
70 MEMO: 'octal' ( -- parser )
71 "0" token hide 'octal-digit' 1 3 from-m-to-n 2seq
72 [ first oct> ] action ;
74 MEMO: 'hex-digit' ( -- parser ) [ hex-digit? ] satisfy ;
76 MEMO: 'hex' ( -- parser )
77 "x" token hide 'hex-digit' 2 exactly-n 2seq
78 "u" token hide 'hex-digit' 6 exactly-n 2seq 2choice
79 [ first hex> ] action ;
81 : satisfy-tokens ( assoc -- parser )
82 [ >r token r> literal-action ] { } assoc>map choice ;
84 MEMO: 'simple-escape-char' ( -- parser )
93 } [ char=-quot ] assoc-map satisfy-tokens ;
95 MEMO: 'predefined-char-class' ( -- parser )
98 { "D" [ digit? not ] }
99 { "s" [ java-blank? ] }
100 { "S" [ java-blank? not ] }
101 { "w" [ c-identifier-char? ] }
102 { "W" [ c-identifier-char? not ] }
105 MEMO: 'posix-character-class' ( -- parser )
107 { "Lower" [ letter? ] }
108 { "Upper" [ LETTER? ] }
109 { "ASCII" [ ascii? ] }
110 { "Alpha" [ Letter? ] }
111 { "Digit" [ digit? ] }
112 { "Alnum" [ alpha? ] }
113 { "Punct" [ punct? ] }
114 { "Graph" [ java-printable? ] }
115 { "Print" [ java-printable? ] }
116 { "Blank" [ " \t" member? ] }
117 { "Cntrl" [ control-char? ] }
118 { "XDigit" [ hex-digit? ] }
119 { "Space" [ java-blank? ] }
120 } satisfy-tokens "p{" "}" surrounded-by ;
122 MEMO: 'simple-escape' ( -- parser )
126 "c" token hide [ LETTER? ] satisfy 2seq ,
128 ] choice* [ char=-quot ] action ;
130 MEMO: 'escape' ( -- parser )
132 'simple-escape-char' ,
133 'predefined-char-class' ,
134 'posix-character-class' ,
138 MEMO: 'any-char' ( -- parser )
139 "." token [ drop t ] literal-action ;
141 MEMO: 'char' ( -- parser )
142 'any-char' 'escape' 'ordinary-char' 3choice [ satisfy ] action ;
146 TUPLE: group-result str ;
148 C: <group-result> group-result
150 MEMO: 'non-capturing-group' ( -- parser )
151 "?:" token hide 'regexp' ;
153 MEMO: 'positive-lookahead-group' ( -- parser )
154 "?=" token hide 'regexp' [ ensure ] action ;
156 MEMO: 'negative-lookahead-group' ( -- parser )
157 "?!" token hide 'regexp' [ ensure-not ] action ;
159 MEMO: 'simple-group' ( -- parser )
160 'regexp' [ [ <group-result> ] action ] action ;
162 MEMO: 'group' ( -- parser )
164 'non-capturing-group' ,
165 'positive-lookahead-group' ,
166 'negative-lookahead-group' ,
168 ] choice* "(" ")" surrounded-by ;
170 MEMO: 'range' ( -- parser )
171 any-char "-" token hide any-char 3seq
172 [ first2 char-between?-quot ] action ;
174 MEMO: 'character-class-term' ( -- parser )
177 [ "\\]" member? not ] satisfy [ char=-quot ] action
180 MEMO: 'positive-character-class' ( -- parser )
182 "]" token [ CHAR: ] = ] literal-action 'character-class-term' repeat0 2seq
183 'character-class-term' repeat1 2choice [ or-predicates ] action ;
185 MEMO: 'negative-character-class' ( -- parser )
186 "^" token hide 'positive-character-class' 2seq
187 [ [ not ] append ] action ;
189 MEMO: 'character-class' ( -- parser )
190 'negative-character-class' 'positive-character-class' 2choice
191 "[" "]" surrounded-by [ satisfy ] action ;
193 MEMO: 'escaped-seq' ( -- parser )
195 [ ignore-case? get token ] action "\\Q" "\\E" surrounded-by ;
197 MEMO: 'break' ( quot -- parser )
199 epsilon just 2choice ;
201 MEMO: 'break-escape' ( -- parser )
202 "$" token [ "\r\n" member? ] 'break' literal-action
203 "\\b" token [ blank? ] 'break' literal-action
204 "\\B" token [ blank? not ] 'break' literal-action
205 "\\z" token epsilon just literal-action 4choice ;
207 MEMO: 'simple' ( -- parser )
216 MEMO: 'exactly-n' ( -- parser )
217 'integer' [ exactly-n ] delay-action ;
219 MEMO: 'at-least-n' ( -- parser )
220 'integer' "," token hide 2seq [ at-least-n ] delay-action ;
222 MEMO: 'at-most-n' ( -- parser )
223 "," token hide 'integer' 2seq [ at-most-n ] delay-action ;
225 MEMO: 'from-m-to-n' ( -- parser )
226 'integer' "," token hide 'integer' 3seq
227 [ first2 from-m-to-n ] delay-action ;
229 MEMO: 'greedy-interval' ( -- parser )
230 'exactly-n' 'at-least-n' 'at-most-n' 'from-m-to-n' 4choice ;
232 MEMO: 'interval' ( -- parser )
234 'greedy-interval' "?" token hide 2seq [ "reluctant {}" print ] action
235 'greedy-interval' "+" token hide 2seq [ "possessive {}" print ] action
236 3choice "{" "}" surrounded-by ;
238 MEMO: 'repetition' ( -- parser )
241 ! "*+" token [ <!*> ] literal-action ,
242 ! "++" token [ <!+> ] literal-action ,
243 ! "?+" token [ <!?> ] literal-action ,
245 ! "*?" token [ <(*)> ] literal-action ,
246 ! "+?" token [ <(+)> ] literal-action ,
247 ! "??" token [ <(?)> ] literal-action ,
249 "*" token [ repeat0 ] literal-action ,
250 "+" token [ repeat1 ] literal-action ,
251 "?" token [ optional ] literal-action ,
254 MEMO: 'dummy' ( -- parser )
255 epsilon [ ] literal-action ;
257 ! todo -- check the action
258 ! MEMO: 'term' ( -- parser )
260 ! 'repetition' 'interval' 'dummy' 3choice 2seq [ first2 call ] action
261 ! <!+> [ <and-parser> ] action ;