1 ! Copyright (C) 2007 Chris Double.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors assocs combinators combinators.short-circuit
4 effects fry kernel make math math.parser multiline namespaces
5 parser peg peg.parsers quotations sequences sequences.deep
6 splitting stack-checker strings strings.parser summary unicode
8 FROM: vocabs.parser => search ;
9 FROM: peg.search => replace ;
12 : rule ( name word -- parser )
13 ! Given an EBNF word produced from EBNF: return the EBNF rule
14 "ebnf-parser" word-prop at ;
16 ERROR: no-rule rule parser ;
20 : lookup-rule ( rule parser -- rule' )
21 2dup rule [ 2nip ] [ no-rule ] if* ;
23 TUPLE: tokenizer-tuple any one many ;
25 : default-tokenizer ( -- tokenizer )
29 [ [ = ] curry any-char swap semantic ]
32 : parser-tokenizer ( parser -- tokenizer )
34 [ swap [ = ] curry semantic ] curry dup tokenizer-tuple boa ;
36 : rule-tokenizer ( name word -- tokenizer )
37 rule parser-tokenizer ;
39 : tokenizer ( -- word )
40 \ tokenizer get-global [ default-tokenizer ] unless* ;
42 : reset-tokenizer ( -- )
43 default-tokenizer \ tokenizer set-global ;
45 TUPLE: ebnf-non-terminal symbol ;
46 TUPLE: ebnf-terminal symbol ;
47 TUPLE: ebnf-foreign word rule ;
48 TUPLE: ebnf-any-character ;
49 TUPLE: ebnf-range pattern ;
50 TUPLE: ebnf-ensure group ;
51 TUPLE: ebnf-ensure-not group ;
52 TUPLE: ebnf-choice options ;
53 TUPLE: ebnf-sequence elements ;
54 TUPLE: ebnf-repeat0 group ;
55 TUPLE: ebnf-repeat1 group ;
56 TUPLE: ebnf-ignore group ;
57 TUPLE: ebnf-ignore-repeat0 group ;
58 TUPLE: ebnf-ignore-repeat1 group ;
59 TUPLE: ebnf-optional group ;
60 TUPLE: ebnf-whitespace group ;
61 TUPLE: ebnf-tokenizer elements ;
62 TUPLE: ebnf-rule symbol elements ;
63 TUPLE: ebnf-action parser code ;
64 TUPLE: ebnf-var parser name ;
65 TUPLE: ebnf-semantic parser code ;
68 C: <ebnf-non-terminal> ebnf-non-terminal
69 C: <ebnf-terminal> ebnf-terminal
70 C: <ebnf-foreign> ebnf-foreign
71 C: <ebnf-any-character> ebnf-any-character
72 C: <ebnf-range> ebnf-range
73 C: <ebnf-ensure> ebnf-ensure
74 C: <ebnf-ensure-not> ebnf-ensure-not
75 C: <ebnf-choice> ebnf-choice
76 C: <ebnf-sequence> ebnf-sequence
77 C: <ebnf-repeat0> ebnf-repeat0
78 C: <ebnf-repeat1> ebnf-repeat1
79 C: <ebnf-ignore> ebnf-ignore
80 C: <ebnf-ignore-repeat0> ebnf-ignore-repeat0
81 C: <ebnf-ignore-repeat1> ebnf-ignore-repeat1
82 C: <ebnf-optional> ebnf-optional
83 C: <ebnf-whitespace> ebnf-whitespace
84 C: <ebnf-tokenizer> ebnf-tokenizer
85 C: <ebnf-rule> ebnf-rule
86 C: <ebnf-action> ebnf-action
87 C: <ebnf-var> ebnf-var
88 C: <ebnf-semantic> ebnf-semantic
93 INSTANCE: ebnf-ignore ebnf-ignored
94 INSTANCE: ebnf-ignore-repeat0 ebnf-ignored
95 INSTANCE: ebnf-ignore-repeat1 ebnf-ignored
97 : filter-hidden ( seq -- seq )
98 ! Remove elements that produce no AST from sequence
99 [ ebnf-ensure-not? ] reject [ ebnf-ensure? ] reject
100 [ ebnf-ignored? ] reject ;
102 : syntax ( string -- parser )
103 ! Parses the string, ignoring white space, and
104 ! does not put the result in the AST.
107 : syntax-pack ( begin parser end -- parser )
108 ! Parse parser-parser surrounded by syntax elements
110 [ syntax ] 2dip syntax pack ;
112 : insert-escapes ( string -- string )
114 "\t" token [ drop "\\t" ] action ,
115 "\n" token [ drop "\\n" ] action ,
116 "\r" token [ drop "\\r" ] action ,
119 : identifier-parser ( -- parser )
120 ! Return a parser that parses an identifer delimited by
121 ! a quotation character. The quotation can be single
122 ! or double quotes. The AST produced is the identifier
123 ! between the quotes.
126 [ CHAR: \ = ] satisfy
127 [ "\"\\" member? ] satisfy 2seq ,
128 [ CHAR: \" = not ] satisfy ,
129 ] choice* repeat1 "\"" "\"" surrounded-by ,
130 [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
131 ] choice* [ "" flatten-as unescape-string ] action ;
133 : non-terminal-parser ( -- parser )
134 ! A non-terminal is the name of another rule. It can
135 ! be any non-blank character except for characters used
136 ! in the EBNF syntax itself.
140 [ "\"'|{}=)(][.!&*+?:~<>" member? ]
142 ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
144 : terminal-parser ( -- parser )
145 ! A terminal is an identifier enclosed in quotations
146 ! and it represents the literal value of the identifier.
147 identifier-parser [ <ebnf-terminal> ] action ;
149 : foreign-name-parser ( -- parser )
150 ! Parse a valid foreign parser name
156 ] satisfy repeat1 [ >string ] action ;
158 : foreign-parser ( -- parser )
159 ! A foreign call is a call to a rule in another ebnf grammar
162 foreign-name-parser sp ,
163 foreign-name-parser sp optional ,
165 ] seq* [ first2 <ebnf-foreign> ] action ;
167 : any-character-parser ( -- parser )
168 ! A parser to match the symbol for any character match.
169 [ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;
171 : range-parser ( -- parser )
172 ! Match the syntax for declaring character ranges
174 [ "[" syntax , "[" token ensure-not , ] seq* hide ,
176 "\\]" token [ second ] action ,
177 [ CHAR: ] = not ] satisfy ,
180 ] seq* [ first >string unescape-string <ebnf-range> ] action ;
182 : (element-parser) ( -- parser )
183 ! An element of a rule. It can be a terminal or a
184 ! non-terminal but must not be followed by a "=".
185 ! The latter indicates that it is the beginning of a
190 non-terminal-parser ,
194 any-character-parser ,
196 [ dup , "~" token hide , ] seq* [ first <ebnf-ignore> ] action ,
197 [ dup , "*~" token hide , ] seq* [ first <ebnf-ignore-repeat0> ] action ,
198 [ dup , "+~" token hide , ] seq* [ first <ebnf-ignore-repeat1> ] action ,
199 [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
200 [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
201 [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
205 "=" syntax ensure-not ,
208 ] seq* [ first ] action ;
212 : element-parser ( -- parser )
215 (element-parser) , ":" syntax ,
216 "a-zA-Z_" range-pattern
217 "a-zA-Z0-9_-" range-pattern repeat1 2seq [ first2 swap prefix >string ] action ,
218 ] seq* [ first2 <ebnf-var> ] action ,
224 : grouped ( quot suffix -- parser )
225 ! Parse a group of choices, with a suffix indicating
226 ! the type of group (repeat0, repeat1, etc) and
227 ! an quot that is the action that produces the AST.
230 "(" [ choice-parser sp ] delay ")" syntax-pack
232 [ first ] rot compose action ,
233 "{" [ choice-parser sp ] delay "}" syntax-pack
235 [ first <ebnf-whitespace> ] rot compose action ,
238 : group-parser ( -- parser )
239 ! A grouping with no suffix. Used for precedence.
241 "~" token sp ensure-not ,
242 "*" token sp ensure-not ,
243 "+" token sp ensure-not ,
244 "?" token sp ensure-not ,
245 ] seq* hide grouped ;
247 : repeat0-parser ( -- parser )
248 [ <ebnf-repeat0> ] "*" syntax grouped ;
250 : repeat1-parser ( -- parser )
251 [ <ebnf-repeat1> ] "+" syntax grouped ;
253 : ignore-parser ( -- parser )
254 [ <ebnf-ignore> ] "~" syntax grouped ;
256 : ignore-repeat0-parser ( -- parser )
257 [ <ebnf-ignore-repeat0> ] "*~" syntax grouped ;
259 : ignore-repeat1-parser ( -- parser )
260 [ <ebnf-ignore-repeat1> ] "+~" syntax grouped ;
262 : optional-parser ( -- parser )
263 [ <ebnf-optional> ] "?" syntax grouped ;
265 : factor-code-parser ( -- parser )
267 "]]" token ensure-not ,
268 "]?" token ensure-not ,
270 ] seq* repeat0 [ "" concat-as ] action ;
272 : ensure-not-parser ( -- parser )
273 ! Parses the '!' syntax to ensure that
274 ! something that matches the following elements do
275 ! not exist in the parse stream.
279 ] seq* [ first <ebnf-ensure-not> ] action ;
281 : ensure-parser ( -- parser )
282 ! Parses the '&' syntax to ensure that
283 ! something that matches the following elements does
284 ! exist in the parse stream.
288 ] seq* [ first <ebnf-ensure> ] action ;
290 : (sequence-parser) ( -- parser )
291 ! A sequence of terminals and non-terminals, including
292 ! groupings of those.
295 ensure-not-parser sp ,
300 ignore-repeat0-parser sp ,
301 ignore-repeat1-parser sp ,
306 dup , ":" syntax , "a-zA-Z" range-pattern repeat1
308 ] seq* [ first2 <ebnf-var> ] action ,
312 : action-parser ( -- parser )
313 "[[" factor-code-parser "]]" syntax-pack ;
315 : semantic-parser ( -- parser )
316 "?[" factor-code-parser "]?" syntax-pack ;
318 : sequence-parser ( -- parser )
319 ! A sequence of terminals and non-terminals, including
320 ! groupings of those.
322 [ (sequence-parser) , action-parser , ] seq*
323 [ first2 <ebnf-action> ] action ,
325 [ (sequence-parser) , semantic-parser , ] seq*
326 [ first2 <ebnf-semantic> ] action ,
330 dup length 1 = [ first ] [ <ebnf-sequence> ] if
333 : actioned-sequence-parser ( -- parser )
335 [ sequence-parser , "=>" syntax , action-parser , ] seq*
336 [ first2 <ebnf-action> ] action ,
340 : choice-parser ( -- parser )
341 actioned-sequence-parser sp repeat1 [
342 dup length 1 = [ first ] [ <ebnf-sequence> ] if
343 ] action "|" token sp list-of [
344 dup length 1 = [ first ] [ <ebnf-choice> ] if
347 : tokenizer-parser ( -- parser )
351 ">" token ensure-not ,
352 [ "default" token sp , choice-parser , ] choice* ,
353 ] seq* [ first <ebnf-tokenizer> ] action ;
355 : rule-parser ( -- parser )
357 "tokenizer" token ensure-not ,
358 non-terminal-parser [ symbol>> ] action ,
360 ">" token ensure-not ,
362 ] seq* [ first2 <ebnf-rule> ] action ;
364 : ebnf-parser ( -- parser )
365 [ tokenizer-parser sp , rule-parser sp , ] choice* repeat1 [ <ebnf> ] action ;
367 GENERIC: (transform) ( ast -- parser )
373 : transform ( ast -- object )
382 rules>> [ (transform) ] map last ;
384 M: ebnf-tokenizer (transform)
385 elements>> dup "default" = [
386 drop default-tokenizer \ tokenizer set-global any-char
389 dup parser-tokenizer \ tokenizer set-global
392 ERROR: redefined-rule name ;
394 M: redefined-rule summary
395 name>> "Rule '" "' defined more than once" surround ;
397 M: ebnf-rule (transform)
400 swap symbol>> dup get parser? [ redefined-rule ] [ set ] if
403 M: ebnf-sequence (transform)
404 ! If ignore-ws is set then each element of the sequence
405 ! ignores leading whitespace. This is not inherited by
406 ! subelements of the sequence.
408 f ignore-ws [ (transform) ] with-variable
409 ignore-ws get [ sp ] when
410 ] map seq [ dup length 1 = [ first ] when ] action ;
412 M: ebnf-choice (transform)
413 options>> [ (transform) ] map choice ;
415 M: ebnf-any-character (transform)
416 drop tokenizer any>> call( -- parser ) ;
418 M: ebnf-range (transform)
419 pattern>> range-pattern ;
421 : transform-group ( ast -- parser )
422 ! convert a ast node with groups to a parser for that group
423 group>> (transform) ;
425 M: ebnf-ensure (transform)
426 transform-group ensure ;
428 M: ebnf-ensure-not (transform)
429 transform-group ensure-not ;
431 M: ebnf-ignore (transform)
432 transform-group [ drop ignore ] action ;
434 M: ebnf-ignore-repeat0 (transform)
435 transform-group repeat0 hide ;
437 M: ebnf-ignore-repeat1 (transform)
438 transform-group repeat1 hide ;
440 M: ebnf-repeat0 (transform)
441 transform-group repeat0 ;
443 M: ebnf-repeat1 (transform)
444 transform-group repeat1 ;
446 M: ebnf-optional (transform)
447 transform-group optional ;
449 M: ebnf-whitespace (transform)
450 t ignore-ws [ transform-group ] with-variable ;
452 GENERIC: build-locals ( code ast -- code )
454 M: ebnf-sequence build-locals
455 ! Note the need to filter out this ebnf items that
456 ! leave nothing in the AST
457 elements>> filter-hidden dup length 1 = [
460 dup [ ebnf-var? ] none? [
467 " " % # " over nth :> " %
480 M: ebnf-var build-locals
482 "[let dup :> " % name>> %
488 M: object build-locals
491 ERROR: bad-effect quot effect ;
493 : check-action-effect ( quot -- quot )
495 { [ dup ( a -- b ) effect<= ] [ drop ] }
496 { [ dup ( -- b ) effect<= ] [ drop [ drop ] prepose ] }
500 : ebnf-transform ( ast -- parser quot )
501 [ parser>> (transform) ]
502 [ code>> insert-escapes ]
503 [ parser>> ] tri build-locals
505 { "dup" dup } { "nip" nip } { "over" over } ! kernel
506 { "nth" nth } ! sequences
507 } [ string-lines parse-lines ] with-words ;
509 M: ebnf-action (transform)
510 ebnf-transform check-action-effect action ;
512 M: ebnf-semantic (transform)
513 ebnf-transform semantic ;
515 M: ebnf-var (transform)
516 parser>> (transform) ;
518 M: ebnf-terminal (transform)
519 symbol>> tokenizer one>> call( symbol -- parser ) ;
521 ERROR: ebnf-foreign-not-found name ;
523 M: ebnf-foreign-not-found summary
524 name>> "Foreign word '" "' not found" surround ;
526 M: ebnf-foreign (transform)
527 dup word>> search [ word>> ebnf-foreign-not-found ] unless*
528 swap rule>> [ main ] unless* over rule [
534 ERROR: parser-not-found name ;
536 M: ebnf-non-terminal (transform)
538 '[ _ dup _ at [ parser-not-found ] unless* nip ] box ;
540 : transform-ebnf ( string -- object )
541 ebnf-parser parse transform ;
543 ERROR: unable-to-fully-parse-ebnf remaining ;
545 ERROR: could-not-parse-ebnf ;
547 : check-parse-result ( result -- result )
549 dup remaining>> [ blank? ] trim [
550 unable-to-fully-parse-ebnf
556 : parse-ebnf ( string -- hashtable )
557 ebnf-parser (parse) check-parse-result ast>> transform ;
559 : ebnf>quot ( string -- hashtable quot: ( string -- results ) )
560 parse-ebnf dup dup parser [ main of compile ] with-variable
561 '[ [ _ compiled-parse ] with-scope ] ;
567 scan-new-word dup scan-object
569 [ "ebnf-quot" set-word-prop ] 2keep
570 [ check-parse-result ast>> ] compose
571 ( input -- ast ) define-declared
572 "ebnf-parser" set-word-prop ;
574 SYNTAX: PARTIAL-EBNF:
576 scan-word "ebnf-quot" word-prop
578 ( input -- ast ) define-declared ;
580 : define-inline-ebnf ( ast string -- quot )
582 ebnf>quot [ check-parse-result ast>> ] compose nip
583 suffix! \ call suffix! reset-tokenizer ;
585 : define-partial-inline-ebnf ( ast string -- quot )
587 ebnf>quot [ ast>> ] compose nip
588 suffix! \ call suffix! reset-tokenizer ;
590 SYNTAX: EBNF[[ "]]" parse-multiline-string define-inline-ebnf ;
591 SYNTAX: EBNF[=[ "]=]" parse-multiline-string define-inline-ebnf ;
592 SYNTAX: EBNF[==[ "]==]" parse-multiline-string define-inline-ebnf ;
593 SYNTAX: EBNF[===[ "]===]" parse-multiline-string define-inline-ebnf ;
594 SYNTAX: EBNF[====[ "]====]" parse-multiline-string define-inline-ebnf ;
596 SYNTAX: PARTIAL-EBNF[[ "]]" parse-multiline-string define-partial-inline-ebnf ;
597 SYNTAX: PARTIAL-EBNF[=[ "]=]" parse-multiline-string define-partial-inline-ebnf ;
598 SYNTAX: PARTIAL-EBNF[==[ "]==]" parse-multiline-string define-partial-inline-ebnf ;
599 SYNTAX: PARTIAL-EBNF[===[ "]===]" parse-multiline-string define-partial-inline-ebnf ;
600 SYNTAX: PARTIAL-EBNF[====[ "]====]" parse-multiline-string define-partial-inline-ebnf ;
605 scan-object parse-ebnf main of '[ _ ]
606 ( -- parser ) define-declared