1 ! Copyright (C) 2007 Chris Double.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors assocs combinators combinators.short-circuit
4 effects kernel make math.parser multiline namespaces parser peg
5 peg.parsers quotations sequences sequences.deep splitting
6 stack-checker strings strings.parser summary unicode.categories
8 FROM: vocabs.parser => search ;
9 FROM: peg.search => replace ;
12 : rule ( name word -- parser )
13 ! Given an EBNF word produced from EBNF: return the EBNF rule
14 "ebnf-parser" word-prop at ;
16 ERROR: no-rule rule parser ;
20 : lookup-rule ( rule parser -- rule' )
21 2dup rule [ 2nip ] [ no-rule ] if* ;
23 TUPLE: tokenizer-tuple any one many ;
25 : default-tokenizer ( -- tokenizer )
29 [ [ = ] curry any-char swap semantic ]
32 : parser-tokenizer ( parser -- tokenizer )
34 [ swap [ = ] curry semantic ] curry dup tokenizer-tuple boa ;
36 : rule-tokenizer ( name word -- tokenizer )
37 rule parser-tokenizer ;
39 : tokenizer ( -- word )
40 \ tokenizer get-global [ default-tokenizer ] unless* ;
42 : reset-tokenizer ( -- )
43 default-tokenizer \ tokenizer set-global ;
45 TUPLE: ebnf-non-terminal symbol ;
46 TUPLE: ebnf-terminal symbol ;
47 TUPLE: ebnf-foreign word rule ;
48 TUPLE: ebnf-any-character ;
49 TUPLE: ebnf-range pattern ;
50 TUPLE: ebnf-ensure group ;
51 TUPLE: ebnf-ensure-not group ;
52 TUPLE: ebnf-choice options ;
53 TUPLE: ebnf-sequence elements ;
54 TUPLE: ebnf-ignore group ;
55 TUPLE: ebnf-repeat0 group ;
56 TUPLE: ebnf-repeat1 group ;
57 TUPLE: ebnf-optional group ;
58 TUPLE: ebnf-whitespace group ;
59 TUPLE: ebnf-tokenizer elements ;
60 TUPLE: ebnf-rule symbol elements ;
61 TUPLE: ebnf-action parser code ;
62 TUPLE: ebnf-var parser name ;
63 TUPLE: ebnf-semantic parser code ;
66 C: <ebnf-non-terminal> ebnf-non-terminal
67 C: <ebnf-terminal> ebnf-terminal
68 C: <ebnf-foreign> ebnf-foreign
69 C: <ebnf-any-character> ebnf-any-character
70 C: <ebnf-range> ebnf-range
71 C: <ebnf-ensure> ebnf-ensure
72 C: <ebnf-ensure-not> ebnf-ensure-not
73 C: <ebnf-choice> ebnf-choice
74 C: <ebnf-sequence> ebnf-sequence
75 C: <ebnf-ignore> ebnf-ignore
76 C: <ebnf-repeat0> ebnf-repeat0
77 C: <ebnf-repeat1> ebnf-repeat1
78 C: <ebnf-optional> ebnf-optional
79 C: <ebnf-whitespace> ebnf-whitespace
80 C: <ebnf-tokenizer> ebnf-tokenizer
81 C: <ebnf-rule> ebnf-rule
82 C: <ebnf-action> ebnf-action
83 C: <ebnf-var> ebnf-var
84 C: <ebnf-semantic> ebnf-semantic
87 : filter-hidden ( seq -- seq )
88 ! Remove elements that produce no AST from sequence
89 [ ebnf-ensure-not? ] reject [ ebnf-ensure? not ] filter ;
91 : syntax ( string -- parser )
92 ! Parses the string, ignoring white space, and
93 ! does not put the result in the AST.
96 : syntax-pack ( begin parser end -- parser )
97 ! Parse parser-parser surrounded by syntax elements
99 [ syntax ] 2dip syntax pack ;
101 : insert-escapes ( string -- string )
103 "\t" token [ drop "\\t" ] action ,
104 "\n" token [ drop "\\n" ] action ,
105 "\r" token [ drop "\\r" ] action ,
108 : identifier-parser ( -- parser )
109 ! Return a parser that parses an identifer delimited by
110 ! a quotation character. The quotation can be single
111 ! or double quotes. The AST produced is the identifier
112 ! between the quotes.
115 [ CHAR: \ = ] satisfy
116 [ "\"\\" member? ] satisfy 2seq ,
117 [ CHAR: " = not ] satisfy ,
118 ] choice* repeat1 "\"" "\"" surrounded-by ,
119 [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
120 ] choice* [ "" flatten-as unescape-string ] action ;
122 : non-terminal-parser ( -- parser )
123 ! A non-terminal is the name of another rule. It can
124 ! be any non-blank character except for characters used
125 ! in the EBNF syntax itself.
129 [ "\"'|{}=)(][.!&*+?:~<>" member? ]
131 ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
133 : terminal-parser ( -- parser )
134 ! A terminal is an identifier enclosed in quotations
135 ! and it represents the literal value of the identifier.
136 identifier-parser [ <ebnf-terminal> ] action ;
138 : foreign-name-parser ( -- parser )
139 ! Parse a valid foreign parser name
145 ] satisfy repeat1 [ >string ] action ;
147 : foreign-parser ( -- parser )
148 ! A foreign call is a call to a rule in another ebnf grammar
151 foreign-name-parser sp ,
152 foreign-name-parser sp optional ,
154 ] seq* [ first2 <ebnf-foreign> ] action ;
156 : any-character-parser ( -- parser )
157 ! A parser to match the symbol for any character match.
158 [ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;
160 : range-parser-parser ( -- parser )
161 ! Match the syntax for declaring character ranges
163 [ "[" syntax , "[" token ensure-not , ] seq* hide ,
164 [ CHAR: ] = not ] satisfy repeat1 ,
166 ] seq* [ first >string unescape-string <ebnf-range> ] action ;
168 : (element-parser) ( -- parser )
169 ! An element of a rule. It can be a terminal or a
170 ! non-terminal but must not be followed by a "=".
171 ! The latter indicates that it is the beginning of a
176 non-terminal-parser ,
179 range-parser-parser ,
180 any-character-parser ,
182 [ dup , "~" token hide , ] seq* [ first <ebnf-ignore> ] action ,
183 [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
184 [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
185 [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
189 "=" syntax ensure-not ,
192 ] seq* [ first ] action ;
196 : element-parser ( -- parser )
199 (element-parser) , ":" syntax ,
200 "a-zA-Z_" range-pattern
201 "a-zA-Z0-9_-" range-pattern repeat1 2seq [ first2 swap prefix >string ] action ,
202 ] seq* [ first2 <ebnf-var> ] action ,
208 : grouped ( quot suffix -- parser )
209 ! Parse a group of choices, with a suffix indicating
210 ! the type of group (repeat0, repeat1, etc) and
211 ! an quot that is the action that produces the AST.
214 "(" [ choice-parser sp ] delay ")" syntax-pack
216 [ first ] rot compose action ,
217 "{" [ choice-parser sp ] delay "}" syntax-pack
219 [ first <ebnf-whitespace> ] rot compose action ,
222 : group-parser ( -- parser )
223 ! A grouping with no suffix. Used for precedence.
225 "~" token sp ensure-not ,
226 "*" token sp ensure-not ,
227 "+" token sp ensure-not ,
228 "?" token sp ensure-not ,
229 ] seq* hide grouped ;
231 : ignore-parser ( -- parser )
232 [ <ebnf-ignore> ] "~" syntax grouped ;
234 : repeat0-parser ( -- parser )
235 [ <ebnf-repeat0> ] "*" syntax grouped ;
237 : repeat1-parser ( -- parser )
238 [ <ebnf-repeat1> ] "+" syntax grouped ;
240 : optional-parser ( -- parser )
241 [ <ebnf-optional> ] "?" syntax grouped ;
243 : factor-code-parser ( -- parser )
245 "]]" token ensure-not ,
246 "]?" token ensure-not ,
248 ] seq* repeat0 [ "" concat-as ] action ;
250 : ensure-not-parser ( -- parser )
251 ! Parses the '!' syntax to ensure that
252 ! something that matches the following elements do
253 ! not exist in the parse stream.
257 ] seq* [ first <ebnf-ensure-not> ] action ;
259 : ensure-parser ( -- parser )
260 ! Parses the '&' syntax to ensure that
261 ! something that matches the following elements does
262 ! exist in the parse stream.
266 ] seq* [ first <ebnf-ensure> ] action ;
268 : (sequence-parser) ( -- parser )
269 ! A sequence of terminals and non-terminals, including
270 ! groupings of those.
273 ensure-not-parser sp ,
282 [ dup , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
286 : action-parser ( -- parser )
287 "[[" factor-code-parser "]]" syntax-pack ;
289 : semantic-parser ( -- parser )
290 "?[" factor-code-parser "]?" syntax-pack ;
292 : sequence-parser ( -- parser )
293 ! A sequence of terminals and non-terminals, including
294 ! groupings of those.
296 [ (sequence-parser) , action-parser , ] seq*
297 [ first2 <ebnf-action> ] action ,
299 [ (sequence-parser) , semantic-parser , ] seq*
300 [ first2 <ebnf-semantic> ] action ,
304 dup length 1 = [ first ] [ <ebnf-sequence> ] if
307 : actioned-sequence-parser ( -- parser )
309 [ sequence-parser , "=>" syntax , action-parser , ] seq*
310 [ first2 <ebnf-action> ] action ,
314 : choice-parser ( -- parser )
315 actioned-sequence-parser sp repeat1 [
316 dup length 1 = [ first ] [ <ebnf-sequence> ] if
317 ] action "|" token sp list-of [
318 dup length 1 = [ first ] [ <ebnf-choice> ] if
321 : tokenizer-parser ( -- parser )
325 ">" token ensure-not ,
326 [ "default" token sp , choice-parser , ] choice* ,
327 ] seq* [ first <ebnf-tokenizer> ] action ;
329 : rule-parser ( -- parser )
331 "tokenizer" token ensure-not ,
332 non-terminal-parser [ symbol>> ] action ,
334 ">" token ensure-not ,
336 ] seq* [ first2 <ebnf-rule> ] action ;
338 : ebnf-parser ( -- parser )
339 [ tokenizer-parser sp , rule-parser sp , ] choice* repeat1 [ <ebnf> ] action ;
341 GENERIC: (transform) ( ast -- parser )
347 : transform ( ast -- object )
355 M: ebnf (transform) ( ast -- parser )
356 rules>> [ (transform) ] map last ;
358 M: ebnf-tokenizer (transform) ( ast -- parser )
359 elements>> dup "default" = [
360 drop default-tokenizer \ tokenizer set-global any-char
363 dup parser-tokenizer \ tokenizer set-global
366 ERROR: redefined-rule name ;
368 M: redefined-rule summary
369 name>> "Rule '" "' defined more than once" surround ;
371 M: ebnf-rule (transform) ( ast -- parser )
374 swap symbol>> dup get parser? [ redefined-rule ] [ set ] if
377 M: ebnf-sequence (transform) ( ast -- parser )
378 ! If ignore-ws is set then each element of the sequence
379 ! ignores leading whitespace. This is not inherited by
380 ! subelements of the sequence.
382 f ignore-ws [ (transform) ] with-variable
383 ignore-ws get [ sp ] when
384 ] map seq [ dup length 1 = [ first ] when ] action ;
386 M: ebnf-choice (transform) ( ast -- parser )
387 options>> [ (transform) ] map choice ;
389 M: ebnf-any-character (transform) ( ast -- parser )
390 drop tokenizer any>> call( -- parser ) ;
392 M: ebnf-range (transform) ( ast -- parser )
393 pattern>> range-pattern ;
395 : transform-group ( ast -- parser )
396 ! convert a ast node with groups to a parser for that group
397 group>> (transform) ;
399 M: ebnf-ensure (transform) ( ast -- parser )
400 transform-group ensure ;
402 M: ebnf-ensure-not (transform) ( ast -- parser )
403 transform-group ensure-not ;
405 M: ebnf-ignore (transform) ( ast -- parser )
406 transform-group [ drop ignore ] action ;
408 M: ebnf-repeat0 (transform) ( ast -- parser )
409 transform-group repeat0 ;
411 M: ebnf-repeat1 (transform) ( ast -- parser )
412 transform-group repeat1 ;
414 M: ebnf-optional (transform) ( ast -- parser )
415 transform-group optional ;
417 M: ebnf-whitespace (transform) ( ast -- parser )
418 t ignore-ws [ transform-group ] with-variable ;
420 GENERIC: build-locals ( code ast -- code )
422 M: ebnf-sequence build-locals ( code ast -- code )
423 ! Note the need to filter out this ebnf items that
424 ! leave nothing in the AST
425 elements>> filter-hidden dup length 1 = [
428 dup [ ebnf-var? ] any? not [
432 "FROM: locals => [let :> ; FROM: sequences => nth ; FROM: kernel => nip over ; [let " %
435 " " % # " over nth :> " %
448 M: ebnf-var build-locals ( code ast -- code )
450 "FROM: locals => [let :> ; FROM: kernel => dup nip ; [let " %
451 " dup :> " % name>> %
457 M: object build-locals ( code ast -- code )
460 ERROR: bad-effect quot effect ;
462 : check-action-effect ( quot -- quot )
464 { [ dup ( a -- b ) effect<= ] [ drop ] }
465 { [ dup ( -- b ) effect<= ] [ drop [ drop ] prepose ] }
469 : ebnf-transform ( ast -- parser quot )
470 [ parser>> (transform) ]
471 [ code>> insert-escapes ]
472 [ parser>> ] tri build-locals
473 string-lines parse-lines ;
475 M: ebnf-action (transform) ( ast -- parser )
476 ebnf-transform check-action-effect action ;
478 M: ebnf-semantic (transform) ( ast -- parser )
479 ebnf-transform semantic ;
481 M: ebnf-var (transform) ( ast -- parser )
482 parser>> (transform) ;
484 M: ebnf-terminal (transform) ( ast -- parser )
485 symbol>> tokenizer one>> call( symbol -- parser ) ;
487 ERROR: ebnf-foreign-not-found name ;
489 M: ebnf-foreign-not-found summary
490 name>> "Foreign word '" "' not found" surround ;
492 M: ebnf-foreign (transform) ( ast -- parser )
493 dup word>> search [ word>> ebnf-foreign-not-found ] unless*
494 swap rule>> [ main ] unless* over rule [
500 ERROR: parser-not-found name ;
502 M: ebnf-non-terminal (transform) ( ast -- parser )
504 , \ dup , parser get , \ at ,
505 [ parser-not-found ] , \ unless* , \ nip ,
508 : transform-ebnf ( string -- object )
509 ebnf-parser parse transform ;
511 ERROR: unable-to-fully-parse-ebnf remaining ;
513 ERROR: could-not-parse-ebnf ;
515 : check-parse-result ( result -- result )
517 dup remaining>> [ blank? ] trim [
518 unable-to-fully-parse-ebnf
524 : parse-ebnf ( string -- hashtable )
525 ebnf-parser (parse) check-parse-result ast>> transform ;
527 : ebnf>quot ( string -- hashtable quot )
528 parse-ebnf dup dup parser [ main of compile ] with-variable
529 [ compiled-parse ] curry [ with-scope ast>> ] curry ;
535 reset-tokenizer parse-multiline-string parse-ebnf main of
536 suffix! reset-tokenizer ;
540 reset-tokenizer parse-multiline-string ebnf>quot nip
541 suffix! \ call suffix! reset-tokenizer ;
544 reset-tokenizer scan-new-word dup ";EBNF" parse-multiline-string
546 ( input -- ast ) define-declared "ebnf-parser" set-word-prop