1 ! Copyright (C) 2007 Chris Double.
2 ! See https://factorcode.org/license.txt for BSD license.
4 USING: accessors arrays assocs combinators
5 combinators.short-circuit definitions effects kernel make
6 math.parser multiline namespaces parser peg peg.parsers
7 peg.private peg.search quotations sequences sequences.deep
8 splitting stack-checker strings strings.parser summary unicode
13 : rule ( name word -- parser )
14 ! Given an EBNF word produced from EBNF: return the EBNF rule
15 "ebnf-parser" word-prop at ;
17 ERROR: no-rule rule parser ;
21 : lookup-rule ( rule parser -- rule' )
22 2dup rule [ 2nip ] [ no-rule ] if* ;
24 TUPLE: tokenizer-tuple any one many ;
26 : default-tokenizer ( -- tokenizer )
30 [ [ = ] curry any-char swap semantic ]
33 : parser-tokenizer ( parser -- tokenizer )
35 [ swap [ = ] curry semantic ] curry dup tokenizer-tuple boa ;
37 : rule-tokenizer ( name word -- tokenizer )
38 rule parser-tokenizer ;
40 : tokenizer ( -- word )
41 \ tokenizer get-global [ default-tokenizer ] unless* ;
43 : reset-tokenizer ( -- )
44 default-tokenizer \ tokenizer set-global ;
46 TUPLE: ebnf-non-terminal symbol ;
47 TUPLE: ebnf-terminal symbol ;
48 TUPLE: ebnf-foreign word rule ;
49 TUPLE: ebnf-any-character ;
50 TUPLE: ebnf-range pattern ;
51 TUPLE: ebnf-ensure group ;
52 TUPLE: ebnf-ensure-not group ;
53 TUPLE: ebnf-choice options ;
54 TUPLE: ebnf-sequence elements ;
55 TUPLE: ebnf-repeat0 group ;
56 TUPLE: ebnf-repeat1 group ;
57 TUPLE: ebnf-ignore group ;
58 TUPLE: ebnf-optional group ;
59 TUPLE: ebnf-whitespace group ;
60 TUPLE: ebnf-tokenizer elements ;
61 TUPLE: ebnf-rule symbol elements ;
62 TUPLE: ebnf-action parser code ;
63 TUPLE: ebnf-var parser name ;
64 TUPLE: ebnf-semantic parser code ;
67 C: <ebnf-non-terminal> ebnf-non-terminal
68 C: <ebnf-terminal> ebnf-terminal
69 C: <ebnf-foreign> ebnf-foreign
70 C: <ebnf-any-character> ebnf-any-character
71 C: <ebnf-range> ebnf-range
72 C: <ebnf-ensure> ebnf-ensure
73 C: <ebnf-ensure-not> ebnf-ensure-not
74 C: <ebnf-choice> ebnf-choice
75 C: <ebnf-sequence> ebnf-sequence
76 C: <ebnf-repeat0> ebnf-repeat0
77 C: <ebnf-repeat1> ebnf-repeat1
78 C: <ebnf-ignore> ebnf-ignore
79 C: <ebnf-optional> ebnf-optional
80 C: <ebnf-whitespace> ebnf-whitespace
81 C: <ebnf-tokenizer> ebnf-tokenizer
82 C: <ebnf-rule> ebnf-rule
83 C: <ebnf-action> ebnf-action
84 C: <ebnf-var> ebnf-var
85 C: <ebnf-semantic> ebnf-semantic
88 : filter-hidden ( seq -- seq )
89 ! Remove elements that produce no AST from sequence
90 [ ebnf-ensure-not? ] reject [ ebnf-ensure? ] reject
91 [ ebnf-ignore? ] reject ;
93 : syntax ( string -- parser )
94 ! Parses the string, ignoring white space, and
95 ! does not put the result in the AST.
98 : syntax-pack ( begin parser end -- parser )
99 ! Parse parser-parser surrounded by syntax elements
101 [ syntax ] 2dip syntax pack ;
103 : insert-escapes ( string -- string )
105 "\t" token [ drop "\\t" ] action ,
106 "\n" token [ drop "\\n" ] action ,
107 "\r" token [ drop "\\r" ] action ,
108 ] choice* peg-replace ;
110 : identifier-parser ( -- parser )
111 ! Return a parser that parses an identifier delimited by
112 ! a quotation character. The quotation can be single
113 ! or double quotes. The AST produced is the identifier
114 ! between the quotes.
117 [ CHAR: \ = ] satisfy
118 [ "\"\\" member? ] satisfy 2seq ,
119 [ CHAR: \" = not ] satisfy ,
120 ] choice* repeat1 "\"" "\"" surrounded-by ,
121 [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
122 ] choice* [ "" flatten-as unescape-string ] action ;
124 : non-terminal-parser ( -- parser )
125 ! A non-terminal is the name of another rule. It can
126 ! be any non-blank character except for characters used
127 ! in the EBNF syntax itself.
131 [ "\"'|{}=)(][.!&*+?:~<>" member? ]
133 ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
135 : terminal-parser ( -- parser )
136 ! A terminal is an identifier enclosed in quotations
137 ! and it represents the literal value of the identifier.
138 identifier-parser [ <ebnf-terminal> ] action ;
140 : foreign-name-parser ( -- parser )
141 ! Parse a valid foreign parser name
147 ] satisfy repeat1 [ >string ] action ;
149 : foreign-parser ( -- parser )
150 ! A foreign call is a call to a rule in another ebnf grammar
153 foreign-name-parser sp ,
154 foreign-name-parser sp optional ,
156 ] seq* [ first2 <ebnf-foreign> ] action ;
158 : any-character-parser ( -- parser )
159 ! A parser to match the symbol for any character match.
160 [ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;
162 : range-parser ( -- parser )
163 ! Match the syntax for declaring character ranges
165 [ "[" syntax , "[" token ensure-not , ] seq* hide ,
167 "\\]" token [ second ] action ,
168 [ CHAR: ] = not ] satisfy ,
171 ] seq* [ first >string unescape-string <ebnf-range> ] action ;
173 : (element-parser) ( -- parser )
174 ! An element of a rule. It can be a terminal or a
175 ! non-terminal but must not be followed by a "=".
176 ! The latter indicates that it is the beginning of a
181 non-terminal-parser ,
185 any-character-parser ,
187 [ dup , "~" token hide , ] seq* [ first <ebnf-ignore> ] action ,
188 [ dup , "*~" token hide , ] seq* [ first <ebnf-repeat0> <ebnf-ignore> ] action ,
189 [ dup , "+~" token hide , ] seq* [ first <ebnf-repeat1> <ebnf-ignore> ] action ,
190 [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
191 [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
192 [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
196 "=" syntax ensure-not ,
199 ] seq* [ first ] action ;
203 : element-parser ( -- parser )
206 (element-parser) , ":" syntax ,
207 "a-zA-Z_" range-pattern
208 "a-zA-Z0-9_-" range-pattern repeat1 2seq [ first2 swap prefix >string ] action ,
209 ] seq* [ first2 <ebnf-var> ] action ,
215 : grouped ( quot suffix -- parser )
216 ! Parse a group of choices, with a suffix indicating
217 ! the type of group (repeat0, repeat1, etc) and
218 ! an quot that is the action that produces the AST.
221 "(" [ choice-parser sp ] delay ")" syntax-pack
223 [ first ] rot compose action ,
224 "{" [ choice-parser sp ] delay "}" syntax-pack
226 [ first <ebnf-whitespace> ] rot compose action ,
229 : group-parser ( -- parser )
230 ! A grouping with no suffix. Used for precedence.
232 "~" token sp ensure-not ,
233 "*" token sp ensure-not ,
234 "+" token sp ensure-not ,
235 "?" token sp ensure-not ,
236 ] seq* hide grouped ;
238 : repeat0-parser ( -- parser )
239 [ <ebnf-repeat0> ] "*" syntax grouped ;
241 : repeat1-parser ( -- parser )
242 [ <ebnf-repeat1> ] "+" syntax grouped ;
244 : ignore-parser ( -- parser )
245 [ <ebnf-ignore> ] "~" syntax grouped ;
247 : ignore-repeat0-parser ( -- parser )
248 [ <ebnf-repeat0> <ebnf-ignore> ] "*~" syntax grouped ;
250 : ignore-repeat1-parser ( -- parser )
251 [ <ebnf-repeat1> <ebnf-ignore> ] "+~" syntax grouped ;
253 : optional-parser ( -- parser )
254 [ <ebnf-optional> ] "?" syntax grouped ;
256 : factor-code-parser ( -- parser )
258 "]]" token ensure-not ,
259 "]?" token ensure-not ,
261 ] seq* repeat0 [ "" concat-as ] action ;
263 : ensure-not-parser ( -- parser )
264 ! Parses the '!' syntax to ensure that
265 ! something that matches the following elements do
266 ! not exist in the parse stream.
270 ] seq* [ first <ebnf-ensure-not> ] action ;
272 : ensure-parser ( -- parser )
273 ! Parses the '&' syntax to ensure that
274 ! something that matches the following elements does
275 ! exist in the parse stream.
279 ] seq* [ first <ebnf-ensure> ] action ;
281 : (sequence-parser) ( -- parser )
282 ! A sequence of terminals and non-terminals, including
283 ! groupings of those.
286 ensure-not-parser sp ,
291 ignore-repeat0-parser sp ,
292 ignore-repeat1-parser sp ,
297 dup , ":" syntax , "a-zA-Z" range-pattern repeat1
299 ] seq* [ first2 <ebnf-var> ] action ,
303 : action-parser ( -- parser )
304 "[[" factor-code-parser "]]" syntax-pack ;
306 : semantic-parser ( -- parser )
307 "?[" factor-code-parser "]?" syntax-pack ;
309 : sequence-parser ( -- parser )
310 ! A sequence of terminals and non-terminals, including
311 ! groupings of those.
313 [ (sequence-parser) , action-parser , ] seq*
314 [ first2 <ebnf-action> ] action ,
316 [ (sequence-parser) , semantic-parser , ] seq*
317 [ first2 <ebnf-semantic> ] action ,
321 dup length 1 = [ first ] [ <ebnf-sequence> ] if
324 : actioned-sequence-parser ( -- parser )
326 [ sequence-parser , "=>" syntax , action-parser , ] seq*
327 [ first2 <ebnf-action> ] action ,
331 : choice-parser ( -- parser )
332 actioned-sequence-parser sp repeat1 [
333 dup length 1 = [ first ] [ <ebnf-sequence> ] if
334 ] action "|" token sp list-of [
335 dup length 1 = [ first ] [ <ebnf-choice> ] if
338 : tokenizer-parser ( -- parser )
342 ">" token ensure-not ,
343 [ "default" token sp , choice-parser , ] choice* ,
344 ] seq* [ first <ebnf-tokenizer> ] action ;
346 : rule-parser ( -- parser )
348 "tokenizer" token ensure-not ,
349 non-terminal-parser [ symbol>> ] action ,
351 ">" token ensure-not ,
353 ] seq* [ first2 <ebnf-rule> ] action ;
355 : ebnf-parser ( -- parser )
356 [ tokenizer-parser sp , rule-parser sp , ] choice* repeat1 [ <ebnf> ] action ;
358 GENERIC: (transform) ( ast -- parser )
364 : transform ( ast -- object )
373 rules>> [ (transform) ] map last ;
375 M: ebnf-tokenizer (transform)
376 elements>> dup "default" = [
377 drop default-tokenizer \ tokenizer set-global any-char
380 dup parser-tokenizer \ tokenizer set-global
383 ERROR: redefined-rule name ;
385 M: redefined-rule summary
386 name>> "Rule '" "' defined more than once" surround ;
388 M: ebnf-rule (transform)
391 swap symbol>> dup get parser? [ redefined-rule ] [ set ] if
394 M: ebnf-sequence (transform)
395 ! If ignore-ws is set then each element of the sequence
396 ! ignores leading whitespace. This is not inherited by
397 ! subelements of the sequence.
399 f ignore-ws [ (transform) ] with-variable
400 ignore-ws get [ sp ] when
401 ] map seq [ dup length 1 = [ first ] when ] action ;
403 M: ebnf-choice (transform)
404 options>> [ (transform) ] map choice ;
406 M: ebnf-any-character (transform)
407 drop tokenizer any>> call( -- parser ) ;
409 M: ebnf-range (transform)
410 pattern>> range-pattern ;
412 : transform-group ( ast -- parser )
413 ! convert a ast node with groups to a parser for that group
414 group>> (transform) ;
416 M: ebnf-ensure (transform)
417 transform-group ensure ;
419 M: ebnf-ensure-not (transform)
420 transform-group ensure-not ;
422 M: ebnf-ignore (transform)
423 transform-group [ drop ignore ] action ;
425 M: ebnf-repeat0 (transform)
426 transform-group repeat0 ;
428 M: ebnf-repeat1 (transform)
429 transform-group repeat1 ;
431 M: ebnf-optional (transform)
432 transform-group optional ;
434 M: ebnf-whitespace (transform)
435 t ignore-ws [ transform-group ] with-variable ;
437 GENERIC: build-locals ( code ast -- code )
439 M: ebnf-sequence build-locals
440 ! Note the need to filter out this ebnf items that
441 ! leave nothing in the AST
442 elements>> filter-hidden dup length 1 = [
445 dup [ ebnf-var? ] none? [
452 " " % # " over nth :> " %
465 M: ebnf-var build-locals
467 "[let dup :> " % name>> %
473 M: ebnf-whitespace build-locals
474 group>> build-locals ;
476 M: object build-locals
479 ERROR: bad-effect quot effect ;
481 : check-action-effect ( quot -- quot )
483 { [ dup ( a -- b ) effect<= ] [ drop ] }
484 { [ dup ( -- b ) effect<= ] [ drop [ drop ] prepose ] }
488 : ebnf-transform ( ast -- parser quot )
489 [ parser>> (transform) ]
490 [ code>> insert-escapes ]
491 [ parser>> ] tri build-locals
493 { "dup" dup } { "nip" nip } { "over" over } ! kernel
494 { "nth" nth } ! sequences
495 } [ split-lines parse-lines ] with-words ;
497 M: ebnf-action (transform)
498 ebnf-transform check-action-effect action ;
500 M: ebnf-semantic (transform)
501 ebnf-transform semantic ;
503 M: ebnf-var (transform)
504 parser>> (transform) ;
506 M: ebnf-terminal (transform)
507 symbol>> tokenizer one>> call( symbol -- parser ) ;
509 ERROR: ebnf-foreign-not-found name ;
511 M: ebnf-foreign-not-found summary
512 name>> "Foreign word '" "' not found" surround ;
514 M: ebnf-foreign (transform)
515 dup word>> search [ word>> ebnf-foreign-not-found ] unless*
516 swap rule>> [ main ] unless* over rule [
522 ERROR: parser-not-found name ;
524 M: ebnf-non-terminal (transform)
526 '[ _ dup _ at [ parser-not-found ] unless* nip ] box ;
528 : parse-ebnf ( string -- hashtable )
529 ebnf-parser parse-fully transform ;
531 : ebnf>quot ( string -- hashtable quot: ( string -- results ) )
532 parse-ebnf dup dup parser [ main of compile-parser ] with-variable
533 '[ [ _ perform-parse ] with-scope ] ;
539 scan-new-word dup scan-object
541 [ "ebnf-quot" set-word-prop ] 2keep
542 [ check-parse-result ast>> ] compose
543 ( input -- ast ) define-declared
544 "ebnf-parser" set-word-prop ;
546 SYNTAX: PARTIAL-EBNF:
548 scan-word "ebnf-quot" word-prop
550 ( input -- ast ) define-declared ;
552 PREDICATE: ebnf-word < word "ebnf-quot" word-prop >boolean ;
554 M: ebnf-word reset-word
556 [ "ebnf-quot" word-prop first first forget ]
557 [ { "ebnf-quot" "ebnf-parser" } remove-word-props ] tri ;
560 [ call-next-method ] [ "ebnf-quot" word-prop first first forget ] bi ;
562 : define-inline-ebnf ( ast string -- quot )
564 ebnf>quot [ check-parse-result ast>> ] compose nip
565 suffix! \ call suffix! reset-tokenizer ;
567 : define-partial-inline-ebnf ( ast string -- quot )
569 ebnf>quot [ ast>> ] compose nip
570 suffix! \ call suffix! reset-tokenizer ;
572 SYNTAX: EBNF[[ "]]" parse-multiline-string define-inline-ebnf ;
573 SYNTAX: EBNF[=[ "]=]" parse-multiline-string define-inline-ebnf ;
574 SYNTAX: EBNF[==[ "]==]" parse-multiline-string define-inline-ebnf ;
575 SYNTAX: EBNF[===[ "]===]" parse-multiline-string define-inline-ebnf ;
576 SYNTAX: EBNF[====[ "]====]" parse-multiline-string define-inline-ebnf ;
578 SYNTAX: PARTIAL-EBNF[[ "]]" parse-multiline-string define-partial-inline-ebnf ;
579 SYNTAX: PARTIAL-EBNF[=[ "]=]" parse-multiline-string define-partial-inline-ebnf ;
580 SYNTAX: PARTIAL-EBNF[==[ "]==]" parse-multiline-string define-partial-inline-ebnf ;
581 SYNTAX: PARTIAL-EBNF[===[ "]===]" parse-multiline-string define-partial-inline-ebnf ;
582 SYNTAX: PARTIAL-EBNF[====[ "]====]" parse-multiline-string define-partial-inline-ebnf ;
587 scan-object parse-ebnf main of '[ _ ]
588 ( -- parser ) define-declared