1 ! Copyright (C) 2007 Chris Double.
\r
2 ! See http://factorcode.org/license.txt for BSD license.
\r
3 USING: kernel words arrays strings math.parser
\r
4 sequences quotations vectors namespaces make math assocs
\r
5 continuations peg peg.parsers unicode.categories multiline
\r
6 splitting accessors effects sequences.deep peg.search
\r
7 combinators.short-circuit lexer io.streams.string stack-checker
\r
8 io combinators parser summary ;
\r
9 FROM: compiler.units => with-compilation-unit ;
\r
10 FROM: vocabs.parser => search ;
\r
13 : rule ( name word -- parser )
\r
14 #! Given an EBNF word produced from EBNF: return the EBNF rule
\r
15 "ebnf-parser" word-prop at ;
\r
17 ERROR: no-rule rule parser ;
\r
21 : lookup-rule ( rule parser -- rule' )
\r
22 2dup rule [ 2nip ] [ no-rule ] if* ;
\r
24 TUPLE: tokenizer any one many ;
\r
26 : default-tokenizer ( -- tokenizer )
\r
30 [ [ = ] curry any-char swap semantic ]
\r
33 : parser-tokenizer ( parser -- tokenizer )
\r
35 [ swap [ = ] curry semantic ] curry dup \ tokenizer boa ;
\r
37 : rule-tokenizer ( name word -- tokenizer )
\r
38 rule parser-tokenizer ;
\r
40 : tokenizer ( -- word )
\r
41 \ tokenizer get-global [ default-tokenizer ] unless* ;
\r
43 : reset-tokenizer ( -- )
\r
44 default-tokenizer \ tokenizer set-global ;
\r
46 ERROR: no-tokenizer name ;
\r
48 M: no-tokenizer summary
\r
49 drop "Tokenizer not found" ;
\r
52 scan dup search [ nip ] [ no-tokenizer ] if*
\r
53 execute( -- tokenizer ) \ tokenizer set-global ;
\r
55 TUPLE: ebnf-non-terminal symbol ;
\r
56 TUPLE: ebnf-terminal symbol ;
\r
57 TUPLE: ebnf-foreign word rule ;
\r
58 TUPLE: ebnf-any-character ;
\r
59 TUPLE: ebnf-range pattern ;
\r
60 TUPLE: ebnf-ensure group ;
\r
61 TUPLE: ebnf-ensure-not group ;
\r
62 TUPLE: ebnf-choice options ;
\r
63 TUPLE: ebnf-sequence elements ;
\r
64 TUPLE: ebnf-ignore group ;
\r
65 TUPLE: ebnf-repeat0 group ;
\r
66 TUPLE: ebnf-repeat1 group ;
\r
67 TUPLE: ebnf-optional group ;
\r
68 TUPLE: ebnf-whitespace group ;
\r
69 TUPLE: ebnf-tokenizer elements ;
\r
70 TUPLE: ebnf-rule symbol elements ;
\r
71 TUPLE: ebnf-action parser code ;
\r
72 TUPLE: ebnf-var parser name ;
\r
73 TUPLE: ebnf-semantic parser code ;
\r
76 C: <ebnf-non-terminal> ebnf-non-terminal
\r
77 C: <ebnf-terminal> ebnf-terminal
\r
78 C: <ebnf-foreign> ebnf-foreign
\r
79 C: <ebnf-any-character> ebnf-any-character
\r
80 C: <ebnf-range> ebnf-range
\r
81 C: <ebnf-ensure> ebnf-ensure
\r
82 C: <ebnf-ensure-not> ebnf-ensure-not
\r
83 C: <ebnf-choice> ebnf-choice
\r
84 C: <ebnf-sequence> ebnf-sequence
\r
85 C: <ebnf-ignore> ebnf-ignore
\r
86 C: <ebnf-repeat0> ebnf-repeat0
\r
87 C: <ebnf-repeat1> ebnf-repeat1
\r
88 C: <ebnf-optional> ebnf-optional
\r
89 C: <ebnf-whitespace> ebnf-whitespace
\r
90 C: <ebnf-tokenizer> ebnf-tokenizer
\r
91 C: <ebnf-rule> ebnf-rule
\r
92 C: <ebnf-action> ebnf-action
\r
93 C: <ebnf-var> ebnf-var
\r
94 C: <ebnf-semantic> ebnf-semantic
\r
97 : filter-hidden ( seq -- seq )
\r
98 #! Remove elements that produce no AST from sequence
\r
99 [ ebnf-ensure-not? not ] filter [ ebnf-ensure? not ] filter ;
\r
101 : syntax ( string -- parser )
\r
102 #! Parses the string, ignoring white space, and
\r
103 #! does not put the result in the AST.
\r
106 : syntax-pack ( begin parser end -- parser )
\r
107 #! Parse 'parser' surrounded by syntax elements
\r
109 [ syntax ] 2dip syntax pack ;
\r
111 #! Don't want to use 'replace' in an action since replace doesn't infer.
\r
112 #! Do the compilation of the peg at parse time and call (replace).
\r
113 PEG: escaper ( string -- ast )
\r
115 "\\t" token [ drop "\t" ] action ,
\r
116 "\\n" token [ drop "\n" ] action ,
\r
117 "\\r" token [ drop "\r" ] action ,
\r
118 "\\\\" token [ drop "\\" ] action ,
\r
119 ] choice* any-char-parser 2array choice repeat0 ;
\r
121 : replace-escapes ( string -- string )
\r
122 escaper sift [ [ tree-write ] each ] with-string-writer ;
\r
124 : insert-escapes ( string -- string )
\r
126 "\t" token [ drop "\\t" ] action ,
\r
127 "\n" token [ drop "\\n" ] action ,
\r
128 "\r" token [ drop "\\r" ] action ,
\r
129 ] choice* replace ;
\r
131 : 'identifier' ( -- parser )
\r
132 #! Return a parser that parses an identifer delimited by
\r
133 #! a quotation character. The quotation can be single
\r
134 #! or double quotes. The AST produced is the identifier
\r
135 #! between the quotes.
\r
137 [ CHAR: " = not ] satisfy repeat1 "\"" "\"" surrounded-by ,
\r
138 [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
\r
139 ] choice* [ >string replace-escapes ] action ;
\r
141 : 'non-terminal' ( -- parser )
\r
142 #! A non-terminal is the name of another rule. It can
\r
143 #! be any non-blank character except for characters used
\r
144 #! in the EBNF syntax itself.
\r
169 ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
\r
171 : 'terminal' ( -- parser )
\r
172 #! A terminal is an identifier enclosed in quotations
\r
173 #! and it represents the literal value of the identifier.
\r
174 'identifier' [ <ebnf-terminal> ] action ;
\r
176 : 'foreign-name' ( -- parser )
\r
177 #! Parse a valid foreign parser name
\r
183 ] satisfy repeat1 [ >string ] action ;
\r
185 : 'foreign' ( -- parser )
\r
186 #! A foreign call is a call to a rule in another ebnf grammar
\r
188 "<foreign" syntax ,
\r
189 'foreign-name' sp ,
\r
190 'foreign-name' sp optional ,
\r
192 ] seq* [ first2 <ebnf-foreign> ] action ;
\r
194 : 'any-character' ( -- parser )
\r
195 #! A parser to match the symbol for any character match.
\r
196 [ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;
\r
198 : 'range-parser' ( -- parser )
\r
199 #! Match the syntax for declaring character ranges
\r
201 [ "[" syntax , "[" token ensure-not , ] seq* hide ,
\r
202 [ CHAR: ] = not ] satisfy repeat1 ,
\r
204 ] seq* [ first >string <ebnf-range> ] action ;
\r
206 : ('element') ( -- parser )
\r
207 #! An element of a rule. It can be a terminal or a
\r
208 #! non-terminal but must not be followed by a "=".
\r
209 #! The latter indicates that it is the beginning of a
\r
220 [ dup , "~" token hide , ] seq* [ first <ebnf-ignore> ] action ,
\r
221 [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
\r
222 [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
\r
223 [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
\r
227 "=" syntax ensure-not ,
\r
228 "=>" syntax ensure ,
\r
230 ] seq* [ first ] action ;
\r
234 : 'element' ( -- parser )
\r
237 ('element') , ":" syntax ,
\r
238 "a-zA-Z_" range-pattern
\r
239 "a-zA-Z0-9_-" range-pattern repeat1 2seq [ first2 swap prefix >string ] action ,
\r
240 ] seq* [ first2 <ebnf-var> ] action ,
\r
246 : grouped ( quot suffix -- parser )
\r
247 #! Parse a group of choices, with a suffix indicating
\r
248 #! the type of group (repeat0, repeat1, etc) and
\r
249 #! an quot that is the action that produces the AST.
\r
252 "(" [ 'choice' sp ] delay ")" syntax-pack
\r
254 [ first ] rot compose action ,
\r
255 "{" [ 'choice' sp ] delay "}" syntax-pack
\r
257 [ first <ebnf-whitespace> ] rot compose action ,
\r
260 : 'group' ( -- parser )
\r
261 #! A grouping with no suffix. Used for precedence.
\r
263 "~" token sp ensure-not ,
\r
264 "*" token sp ensure-not ,
\r
265 "+" token sp ensure-not ,
\r
266 "?" token sp ensure-not ,
\r
267 ] seq* hide grouped ;
\r
269 : 'ignore' ( -- parser )
\r
270 [ <ebnf-ignore> ] "~" syntax grouped ;
\r
272 : 'repeat0' ( -- parser )
\r
273 [ <ebnf-repeat0> ] "*" syntax grouped ;
\r
275 : 'repeat1' ( -- parser )
\r
276 [ <ebnf-repeat1> ] "+" syntax grouped ;
\r
278 : 'optional' ( -- parser )
\r
279 [ <ebnf-optional> ] "?" syntax grouped ;
\r
281 : 'factor-code' ( -- parser )
\r
283 "]]" token ensure-not ,
\r
284 "]?" token ensure-not ,
\r
285 [ drop t ] satisfy ,
\r
286 ] seq* repeat0 [ concat >string ] action ;
\r
288 : 'ensure-not' ( -- parser )
\r
289 #! Parses the '!' syntax to ensure that
\r
290 #! something that matches the following elements do
\r
291 #! not exist in the parse stream.
\r
295 ] seq* [ first <ebnf-ensure-not> ] action ;
\r
297 : 'ensure' ( -- parser )
\r
298 #! Parses the '&' syntax to ensure that
\r
299 #! something that matches the following elements does
\r
300 #! exist in the parse stream.
\r
304 ] seq* [ first <ebnf-ensure> ] action ;
\r
306 : ('sequence') ( -- parser )
\r
307 #! A sequence of terminals and non-terminals, including
\r
308 #! groupings of those.
\r
320 [ dup , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
\r
324 : 'action' ( -- parser )
\r
325 "[[" 'factor-code' "]]" syntax-pack ;
\r
327 : 'semantic' ( -- parser )
\r
328 "?[" 'factor-code' "]?" syntax-pack ;
\r
330 : 'sequence' ( -- parser )
\r
331 #! A sequence of terminals and non-terminals, including
\r
332 #! groupings of those.
\r
334 [ ('sequence') , 'action' , ] seq* [ first2 <ebnf-action> ] action ,
\r
335 [ ('sequence') , 'semantic' , ] seq* [ first2 <ebnf-semantic> ] action ,
\r
337 ] choice* repeat1 [
\r
338 dup length 1 = [ first ] [ <ebnf-sequence> ] if
\r
341 : 'actioned-sequence' ( -- parser )
\r
343 [ 'sequence' , "=>" syntax , 'action' , ] seq* [ first2 <ebnf-action> ] action ,
\r
347 : 'choice' ( -- parser )
\r
348 'actioned-sequence' sp repeat1 [ dup length 1 = [ first ] [ <ebnf-sequence> ] if ] action "|" token sp list-of [
\r
349 dup length 1 = [ first ] [ <ebnf-choice> ] if
\r
352 : 'tokenizer' ( -- parser )
\r
354 "tokenizer" syntax ,
\r
356 ">" token ensure-not ,
\r
357 [ "default" token sp , 'choice' , ] choice* ,
\r
358 ] seq* [ first <ebnf-tokenizer> ] action ;
\r
360 : 'rule' ( -- parser )
\r
362 "tokenizer" token ensure-not ,
\r
363 'non-terminal' [ symbol>> ] action ,
\r
365 ">" token ensure-not ,
\r
367 ] seq* [ first2 <ebnf-rule> ] action ;
\r
369 : 'ebnf' ( -- parser )
\r
370 [ 'tokenizer' sp , 'rule' sp , ] choice* repeat1 [ <ebnf> ] action ;
\r
372 GENERIC: (transform) ( ast -- parser )
\r
378 : transform ( ast -- object )
\r
379 H{ } clone dup dup [
\r
386 M: ebnf (transform) ( ast -- parser )
\r
387 rules>> [ (transform) ] map last ;
\r
389 M: ebnf-tokenizer (transform) ( ast -- parser )
\r
390 elements>> dup "default" = [
\r
391 drop default-tokenizer \ tokenizer set-global any-char
\r
394 dup parser-tokenizer \ tokenizer set-global
\r
397 ERROR: redefined-rule name ;
\r
399 M: redefined-rule summary
\r
400 name>> "Rule '" "' defined more than once" surround ;
\r
402 M: ebnf-rule (transform) ( ast -- parser )
\r
405 swap symbol>> dup get parser? [ redefined-rule ] [ set ] if
\r
408 M: ebnf-sequence (transform) ( ast -- parser )
\r
409 #! If ignore-ws is set then each element of the sequence
\r
410 #! ignores leading whitespace. This is not inherited by
\r
411 #! subelements of the sequence.
\r
413 f ignore-ws [ (transform) ] with-variable
\r
414 ignore-ws get [ sp ] when
\r
415 ] map seq [ dup length 1 = [ first ] when ] action ;
\r
417 M: ebnf-choice (transform) ( ast -- parser )
\r
418 options>> [ (transform) ] map choice ;
\r
420 M: ebnf-any-character (transform) ( ast -- parser )
\r
421 drop tokenizer any>> call( -- parser ) ;
\r
423 M: ebnf-range (transform) ( ast -- parser )
\r
424 pattern>> range-pattern ;
\r
426 : transform-group ( ast -- parser )
\r
427 #! convert a ast node with groups to a parser for that group
\r
428 group>> (transform) ;
\r
430 M: ebnf-ensure (transform) ( ast -- parser )
\r
431 transform-group ensure ;
\r
433 M: ebnf-ensure-not (transform) ( ast -- parser )
\r
434 transform-group ensure-not ;
\r
436 M: ebnf-ignore (transform) ( ast -- parser )
\r
437 transform-group [ drop ignore ] action ;
\r
439 M: ebnf-repeat0 (transform) ( ast -- parser )
\r
440 transform-group repeat0 ;
\r
442 M: ebnf-repeat1 (transform) ( ast -- parser )
\r
443 transform-group repeat1 ;
\r
445 M: ebnf-optional (transform) ( ast -- parser )
\r
446 transform-group optional ;
\r
448 M: ebnf-whitespace (transform) ( ast -- parser )
\r
449 t ignore-ws [ transform-group ] with-variable ;
\r
451 GENERIC: build-locals ( code ast -- code )
\r
453 M: ebnf-sequence build-locals ( code ast -- code )
\r
454 #! Note the need to filter out this ebnf items that
\r
455 #! leave nothing in the AST
\r
456 elements>> filter-hidden dup length 1 = [
\r
457 first build-locals
\r
459 dup [ ebnf-var? ] filter empty? [
\r
463 "FROM: locals => [let :> ; FROM: sequences => nth ; FROM: kernel => nip over ; [let " %
\r
466 " " % # " over nth :> " %
\r
479 M: ebnf-var build-locals ( code ast -- )
\r
481 "FROM: locals => [let :> ; FROM: kernel => dup nip ; [let " %
\r
482 " dup :> " % name>> %
\r
488 M: object build-locals ( code ast -- )
\r
491 ERROR: bad-effect quot effect ;
\r
493 : check-action-effect ( quot -- quot )
\r
495 { [ dup (( a -- b )) effect<= ] [ drop ] }
\r
496 { [ dup (( -- b )) effect<= ] [ drop [ drop ] prepose ] }
\r
500 : ebnf-transform ( ast -- parser quot )
\r
501 [ parser>> (transform) ]
\r
502 [ code>> insert-escapes ]
\r
503 [ parser>> ] tri build-locals
\r
504 [ string-lines parse-lines ] call( string -- quot ) ;
\r
506 M: ebnf-action (transform) ( ast -- parser )
\r
507 ebnf-transform check-action-effect action ;
\r
509 M: ebnf-semantic (transform) ( ast -- parser )
\r
510 ebnf-transform semantic ;
\r
512 M: ebnf-var (transform) ( ast -- parser )
\r
513 parser>> (transform) ;
\r
515 M: ebnf-terminal (transform) ( ast -- parser )
\r
516 symbol>> tokenizer one>> call( symbol -- parser ) ;
\r
518 ERROR: ebnf-foreign-not-found name ;
\r
520 M: ebnf-foreign-not-found summary
\r
521 name>> "Foreign word '" "' not found" surround ;
\r
523 M: ebnf-foreign (transform) ( ast -- parser )
\r
524 dup word>> search [ word>> ebnf-foreign-not-found ] unless*
\r
525 swap rule>> [ main ] unless* over rule [
\r
528 execute( -- parser )
\r
531 ERROR: parser-not-found name ;
\r
533 M: ebnf-non-terminal (transform) ( ast -- parser )
\r
535 , \ dup , parser get , \ at , [ parser-not-found ] , \ unless* , \ nip ,
\r
538 : transform-ebnf ( string -- object )
\r
539 'ebnf' parse transform ;
\r
541 : check-parse-result ( result -- result )
\r
543 dup remaining>> [ blank? ] trim [
\r
545 "Unable to fully parse EBNF. Left to parse was: " %
\r
550 "Could not parse EBNF" throw
\r
553 : parse-ebnf ( string -- hashtable )
\r
554 'ebnf' (parse) check-parse-result ast>> transform ;
\r
556 : ebnf>quot ( string -- hashtable quot )
\r
557 parse-ebnf dup dup parser [ main swap at compile ] with-variable
\r
558 [ compiled-parse ] curry [ with-scope ast>> ] curry ;
\r
564 reset-tokenizer parse-multiline-string parse-ebnf main swap at
\r
565 suffix! reset-tokenizer ;
\r
569 reset-tokenizer parse-multiline-string ebnf>quot nip
\r
570 suffix! \ call suffix! reset-tokenizer ;
\r
573 reset-tokenizer CREATE-WORD dup ";EBNF" parse-multiline-string
\r
575 (( input -- ast )) define-declared "ebnf-parser" set-word-prop
\r