1 ! Copyright (C) 2007 Chris Double.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors assocs combinators combinators.short-circuit
4 effects kernel make math.parser multiline namespaces parser peg
5 peg.parsers quotations sequences sequences.deep splitting
6 stack-checker strings strings.parser summary unicode
8 FROM: vocabs.parser => search ;
9 FROM: peg.search => replace ;
12 : rule ( name word -- parser )
13 ! Given an EBNF word produced from EBNF: return the EBNF rule
14 "ebnf-parser" word-prop at ;
16 ERROR: no-rule rule parser ;
20 : lookup-rule ( rule parser -- rule' )
21 2dup rule [ 2nip ] [ no-rule ] if* ;
23 TUPLE: tokenizer-tuple any one many ;
25 : default-tokenizer ( -- tokenizer )
29 [ [ = ] curry any-char swap semantic ]
32 : parser-tokenizer ( parser -- tokenizer )
34 [ swap [ = ] curry semantic ] curry dup tokenizer-tuple boa ;
36 : rule-tokenizer ( name word -- tokenizer )
37 rule parser-tokenizer ;
39 : tokenizer ( -- word )
40 \ tokenizer get-global [ default-tokenizer ] unless* ;
42 : reset-tokenizer ( -- )
43 default-tokenizer \ tokenizer set-global ;
45 TUPLE: ebnf-non-terminal symbol ;
46 TUPLE: ebnf-terminal symbol ;
47 TUPLE: ebnf-foreign word rule ;
48 TUPLE: ebnf-any-character ;
49 TUPLE: ebnf-range pattern ;
50 TUPLE: ebnf-ensure group ;
51 TUPLE: ebnf-ensure-not group ;
52 TUPLE: ebnf-choice options ;
53 TUPLE: ebnf-sequence elements ;
54 TUPLE: ebnf-repeat0 group ;
55 TUPLE: ebnf-repeat1 group ;
56 TUPLE: ebnf-ignore group ;
57 TUPLE: ebnf-optional group ;
58 TUPLE: ebnf-whitespace group ;
59 TUPLE: ebnf-tokenizer elements ;
60 TUPLE: ebnf-rule symbol elements ;
61 TUPLE: ebnf-action parser code ;
62 TUPLE: ebnf-var parser name ;
63 TUPLE: ebnf-semantic parser code ;
66 C: <ebnf-non-terminal> ebnf-non-terminal
67 C: <ebnf-terminal> ebnf-terminal
68 C: <ebnf-foreign> ebnf-foreign
69 C: <ebnf-any-character> ebnf-any-character
70 C: <ebnf-range> ebnf-range
71 C: <ebnf-ensure> ebnf-ensure
72 C: <ebnf-ensure-not> ebnf-ensure-not
73 C: <ebnf-choice> ebnf-choice
74 C: <ebnf-sequence> ebnf-sequence
75 C: <ebnf-repeat0> ebnf-repeat0
76 C: <ebnf-repeat1> ebnf-repeat1
77 C: <ebnf-ignore> ebnf-ignore
78 C: <ebnf-optional> ebnf-optional
79 C: <ebnf-whitespace> ebnf-whitespace
80 C: <ebnf-tokenizer> ebnf-tokenizer
81 C: <ebnf-rule> ebnf-rule
82 C: <ebnf-action> ebnf-action
83 C: <ebnf-var> ebnf-var
84 C: <ebnf-semantic> ebnf-semantic
87 : filter-hidden ( seq -- seq )
88 ! Remove elements that produce no AST from sequence
89 [ ebnf-ensure-not? ] reject [ ebnf-ensure? ] reject
90 [ ebnf-ignore? ] reject ;
92 : syntax ( string -- parser )
93 ! Parses the string, ignoring white space, and
94 ! does not put the result in the AST.
97 : syntax-pack ( begin parser end -- parser )
98 ! Parse parser-parser surrounded by syntax elements
100 [ syntax ] 2dip syntax pack ;
102 : insert-escapes ( string -- string )
104 "\t" token [ drop "\\t" ] action ,
105 "\n" token [ drop "\\n" ] action ,
106 "\r" token [ drop "\\r" ] action ,
109 : identifier-parser ( -- parser )
110 ! Return a parser that parses an identifier delimited by
111 ! a quotation character. The quotation can be single
112 ! or double quotes. The AST produced is the identifier
113 ! between the quotes.
116 [ CHAR: \ = ] satisfy
117 [ "\"\\" member? ] satisfy 2seq ,
118 [ CHAR: \" = not ] satisfy ,
119 ] choice* repeat1 "\"" "\"" surrounded-by ,
120 [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
121 ] choice* [ "" flatten-as unescape-string ] action ;
123 : non-terminal-parser ( -- parser )
124 ! A non-terminal is the name of another rule. It can
125 ! be any non-blank character except for characters used
126 ! in the EBNF syntax itself.
130 [ "\"'|{}=)(][.!&*+?:~<>" member? ]
132 ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
134 : terminal-parser ( -- parser )
135 ! A terminal is an identifier enclosed in quotations
136 ! and it represents the literal value of the identifier.
137 identifier-parser [ <ebnf-terminal> ] action ;
139 : foreign-name-parser ( -- parser )
140 ! Parse a valid foreign parser name
146 ] satisfy repeat1 [ >string ] action ;
148 : foreign-parser ( -- parser )
149 ! A foreign call is a call to a rule in another ebnf grammar
152 foreign-name-parser sp ,
153 foreign-name-parser sp optional ,
155 ] seq* [ first2 <ebnf-foreign> ] action ;
157 : any-character-parser ( -- parser )
158 ! A parser to match the symbol for any character match.
159 [ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;
161 : range-parser ( -- parser )
162 ! Match the syntax for declaring character ranges
164 [ "[" syntax , "[" token ensure-not , ] seq* hide ,
166 "\\]" token [ second ] action ,
167 [ CHAR: ] = not ] satisfy ,
170 ] seq* [ first >string unescape-string <ebnf-range> ] action ;
172 : (element-parser) ( -- parser )
173 ! An element of a rule. It can be a terminal or a
174 ! non-terminal but must not be followed by a "=".
175 ! The latter indicates that it is the beginning of a
180 non-terminal-parser ,
184 any-character-parser ,
186 [ dup , "~" token hide , ] seq* [ first <ebnf-ignore> ] action ,
187 [ dup , "*~" token hide , ] seq* [ first <ebnf-repeat0> <ebnf-ignore> ] action ,
188 [ dup , "+~" token hide , ] seq* [ first <ebnf-repeat1> <ebnf-ignore> ] action ,
189 [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
190 [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
191 [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
195 "=" syntax ensure-not ,
198 ] seq* [ first ] action ;
202 : element-parser ( -- parser )
205 (element-parser) , ":" syntax ,
206 "a-zA-Z_" range-pattern
207 "a-zA-Z0-9_-" range-pattern repeat1 2seq [ first2 swap prefix >string ] action ,
208 ] seq* [ first2 <ebnf-var> ] action ,
214 : grouped ( quot suffix -- parser )
215 ! Parse a group of choices, with a suffix indicating
216 ! the type of group (repeat0, repeat1, etc) and
217 ! an quot that is the action that produces the AST.
220 "(" [ choice-parser sp ] delay ")" syntax-pack
222 [ first ] rot compose action ,
223 "{" [ choice-parser sp ] delay "}" syntax-pack
225 [ first <ebnf-whitespace> ] rot compose action ,
228 : group-parser ( -- parser )
229 ! A grouping with no suffix. Used for precedence.
231 "~" token sp ensure-not ,
232 "*" token sp ensure-not ,
233 "+" token sp ensure-not ,
234 "?" token sp ensure-not ,
235 ] seq* hide grouped ;
237 : repeat0-parser ( -- parser )
238 [ <ebnf-repeat0> ] "*" syntax grouped ;
240 : repeat1-parser ( -- parser )
241 [ <ebnf-repeat1> ] "+" syntax grouped ;
243 : ignore-parser ( -- parser )
244 [ <ebnf-ignore> ] "~" syntax grouped ;
246 : ignore-repeat0-parser ( -- parser )
247 [ <ebnf-repeat0> <ebnf-ignore> ] "*~" syntax grouped ;
249 : ignore-repeat1-parser ( -- parser )
250 [ <ebnf-repeat1> <ebnf-ignore> ] "+~" syntax grouped ;
252 : optional-parser ( -- parser )
253 [ <ebnf-optional> ] "?" syntax grouped ;
255 : factor-code-parser ( -- parser )
257 "]]" token ensure-not ,
258 "]?" token ensure-not ,
260 ] seq* repeat0 [ "" concat-as ] action ;
262 : ensure-not-parser ( -- parser )
263 ! Parses the '!' syntax to ensure that
264 ! something that matches the following elements do
265 ! not exist in the parse stream.
269 ] seq* [ first <ebnf-ensure-not> ] action ;
271 : ensure-parser ( -- parser )
272 ! Parses the '&' syntax to ensure that
273 ! something that matches the following elements does
274 ! exist in the parse stream.
278 ] seq* [ first <ebnf-ensure> ] action ;
280 : (sequence-parser) ( -- parser )
281 ! A sequence of terminals and non-terminals, including
282 ! groupings of those.
285 ensure-not-parser sp ,
290 ignore-repeat0-parser sp ,
291 ignore-repeat1-parser sp ,
296 dup , ":" syntax , "a-zA-Z" range-pattern repeat1
298 ] seq* [ first2 <ebnf-var> ] action ,
302 : action-parser ( -- parser )
303 "[[" factor-code-parser "]]" syntax-pack ;
305 : semantic-parser ( -- parser )
306 "?[" factor-code-parser "]?" syntax-pack ;
308 : sequence-parser ( -- parser )
309 ! A sequence of terminals and non-terminals, including
310 ! groupings of those.
312 [ (sequence-parser) , action-parser , ] seq*
313 [ first2 <ebnf-action> ] action ,
315 [ (sequence-parser) , semantic-parser , ] seq*
316 [ first2 <ebnf-semantic> ] action ,
320 dup length 1 = [ first ] [ <ebnf-sequence> ] if
323 : actioned-sequence-parser ( -- parser )
325 [ sequence-parser , "=>" syntax , action-parser , ] seq*
326 [ first2 <ebnf-action> ] action ,
330 : choice-parser ( -- parser )
331 actioned-sequence-parser sp repeat1 [
332 dup length 1 = [ first ] [ <ebnf-sequence> ] if
333 ] action "|" token sp list-of [
334 dup length 1 = [ first ] [ <ebnf-choice> ] if
337 : tokenizer-parser ( -- parser )
341 ">" token ensure-not ,
342 [ "default" token sp , choice-parser , ] choice* ,
343 ] seq* [ first <ebnf-tokenizer> ] action ;
345 : rule-parser ( -- parser )
347 "tokenizer" token ensure-not ,
348 non-terminal-parser [ symbol>> ] action ,
350 ">" token ensure-not ,
352 ] seq* [ first2 <ebnf-rule> ] action ;
354 : ebnf-parser ( -- parser )
355 [ tokenizer-parser sp , rule-parser sp , ] choice* repeat1 [ <ebnf> ] action ;
357 GENERIC: (transform) ( ast -- parser )
363 : transform ( ast -- object )
372 rules>> [ (transform) ] map last ;
374 M: ebnf-tokenizer (transform)
375 elements>> dup "default" = [
376 drop default-tokenizer \ tokenizer set-global any-char
379 dup parser-tokenizer \ tokenizer set-global
382 ERROR: redefined-rule name ;
384 M: redefined-rule summary
385 name>> "Rule '" "' defined more than once" surround ;
387 M: ebnf-rule (transform)
390 swap symbol>> dup get parser? [ redefined-rule ] [ set ] if
393 M: ebnf-sequence (transform)
394 ! If ignore-ws is set then each element of the sequence
395 ! ignores leading whitespace. This is not inherited by
396 ! subelements of the sequence.
398 f ignore-ws [ (transform) ] with-variable
399 ignore-ws get [ sp ] when
400 ] map seq [ dup length 1 = [ first ] when ] action ;
402 M: ebnf-choice (transform)
403 options>> [ (transform) ] map choice ;
405 M: ebnf-any-character (transform)
406 drop tokenizer any>> call( -- parser ) ;
408 M: ebnf-range (transform)
409 pattern>> range-pattern ;
411 : transform-group ( ast -- parser )
412 ! convert a ast node with groups to a parser for that group
413 group>> (transform) ;
415 M: ebnf-ensure (transform)
416 transform-group ensure ;
418 M: ebnf-ensure-not (transform)
419 transform-group ensure-not ;
421 M: ebnf-ignore (transform)
422 transform-group [ drop ignore ] action ;
424 M: ebnf-repeat0 (transform)
425 transform-group repeat0 ;
427 M: ebnf-repeat1 (transform)
428 transform-group repeat1 ;
430 M: ebnf-optional (transform)
431 transform-group optional ;
433 M: ebnf-whitespace (transform)
434 t ignore-ws [ transform-group ] with-variable ;
436 GENERIC: build-locals ( code ast -- code )
438 M: ebnf-sequence build-locals
439 ! Note the need to filter out this ebnf items that
440 ! leave nothing in the AST
441 elements>> filter-hidden dup length 1 = [
444 dup [ ebnf-var? ] none? [
451 " " % # " over nth :> " %
464 M: ebnf-var build-locals
466 "[let dup :> " % name>> %
472 M: ebnf-whitespace build-locals
473 group>> build-locals ;
475 M: object build-locals
478 ERROR: bad-effect quot effect ;
480 : check-action-effect ( quot -- quot )
482 { [ dup ( a -- b ) effect<= ] [ drop ] }
483 { [ dup ( -- b ) effect<= ] [ drop [ drop ] prepose ] }
487 : ebnf-transform ( ast -- parser quot )
488 [ parser>> (transform) ]
489 [ code>> insert-escapes ]
490 [ parser>> ] tri build-locals
492 { "dup" dup } { "nip" nip } { "over" over } ! kernel
493 { "nth" nth } ! sequences
494 } [ split-lines parse-lines ] with-words ;
496 M: ebnf-action (transform)
497 ebnf-transform check-action-effect action ;
499 M: ebnf-semantic (transform)
500 ebnf-transform semantic ;
502 M: ebnf-var (transform)
503 parser>> (transform) ;
505 M: ebnf-terminal (transform)
506 symbol>> tokenizer one>> call( symbol -- parser ) ;
508 ERROR: ebnf-foreign-not-found name ;
510 M: ebnf-foreign-not-found summary
511 name>> "Foreign word '" "' not found" surround ;
513 M: ebnf-foreign (transform)
514 dup word>> search [ word>> ebnf-foreign-not-found ] unless*
515 swap rule>> [ main ] unless* over rule [
521 ERROR: parser-not-found name ;
523 M: ebnf-non-terminal (transform)
525 '[ _ dup _ at [ parser-not-found ] unless* nip ] box ;
527 : transform-ebnf ( string -- object )
528 ebnf-parser parse transform ;
530 ERROR: unable-to-fully-parse-ebnf remaining ;
532 ERROR: could-not-parse-ebnf ;
534 : check-parse-result ( result -- result )
536 dup remaining>> [ blank? ] trim [
537 unable-to-fully-parse-ebnf
543 : parse-ebnf ( string -- hashtable )
544 ebnf-parser (parse) check-parse-result ast>> transform ;
546 : ebnf>quot ( string -- hashtable quot: ( string -- results ) )
547 parse-ebnf dup dup parser [ main of compile ] with-variable
548 '[ [ _ compiled-parse ] with-scope ] ;
554 scan-new-word dup scan-object
556 [ "ebnf-quot" set-word-prop ] 2keep
557 [ check-parse-result ast>> ] compose
558 ( input -- ast ) define-declared
559 "ebnf-parser" set-word-prop ;
561 SYNTAX: PARTIAL-EBNF:
563 scan-word "ebnf-quot" word-prop
565 ( input -- ast ) define-declared ;
567 : define-inline-ebnf ( ast string -- quot )
569 ebnf>quot [ check-parse-result ast>> ] compose nip
570 suffix! \ call suffix! reset-tokenizer ;
572 : define-partial-inline-ebnf ( ast string -- quot )
574 ebnf>quot [ ast>> ] compose nip
575 suffix! \ call suffix! reset-tokenizer ;
577 SYNTAX: EBNF[[ "]]" parse-multiline-string define-inline-ebnf ;
578 SYNTAX: EBNF[=[ "]=]" parse-multiline-string define-inline-ebnf ;
579 SYNTAX: EBNF[==[ "]==]" parse-multiline-string define-inline-ebnf ;
580 SYNTAX: EBNF[===[ "]===]" parse-multiline-string define-inline-ebnf ;
581 SYNTAX: EBNF[====[ "]====]" parse-multiline-string define-inline-ebnf ;
583 SYNTAX: PARTIAL-EBNF[[ "]]" parse-multiline-string define-partial-inline-ebnf ;
584 SYNTAX: PARTIAL-EBNF[=[ "]=]" parse-multiline-string define-partial-inline-ebnf ;
585 SYNTAX: PARTIAL-EBNF[==[ "]==]" parse-multiline-string define-partial-inline-ebnf ;
586 SYNTAX: PARTIAL-EBNF[===[ "]===]" parse-multiline-string define-partial-inline-ebnf ;
587 SYNTAX: PARTIAL-EBNF[====[ "]====]" parse-multiline-string define-partial-inline-ebnf ;
592 scan-object parse-ebnf main of '[ _ ]
593 ( -- parser ) define-declared