1 ! Copyright (C) 2007 Chris Double.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators
4 combinators.short-circuit effects io.streams.string kernel make
5 math.parser multiline namespaces parser peg peg.parsers
6 peg.search quotations sequences splitting stack-checker strings
7 summary unicode.categories words ;
8 FROM: compiler.units => with-compilation-unit ;
9 FROM: strings.parser => unescape-string ;
10 FROM: vocabs.parser => search ;
11 FROM: peg.search => replace ;
14 : rule ( name word -- parser )
15 #! Given an EBNF word produced from EBNF: return the EBNF rule
16 "ebnf-parser" word-prop at ;
18 ERROR: no-rule rule parser ;
22 : lookup-rule ( rule parser -- rule' )
23 2dup rule [ 2nip ] [ no-rule ] if* ;
25 TUPLE: tokenizer-tuple any one many ;
27 : default-tokenizer ( -- tokenizer )
31 [ [ = ] curry any-char swap semantic ]
34 : parser-tokenizer ( parser -- tokenizer )
36 [ swap [ = ] curry semantic ] curry dup \ tokenizer-tuple boa ;
38 : rule-tokenizer ( name word -- tokenizer )
39 rule parser-tokenizer ;
41 : tokenizer ( -- word )
42 \ tokenizer get-global [ default-tokenizer ] unless* ;
44 : reset-tokenizer ( -- )
45 default-tokenizer \ tokenizer set-global ;
47 ERROR: no-tokenizer name ;
49 M: no-tokenizer summary
50 drop "Tokenizer not found" ;
53 scan-word-name dup search [ nip ] [ no-tokenizer ] if*
54 execute( -- tokenizer ) \ tokenizer set-global ;
56 TUPLE: ebnf-non-terminal symbol ;
57 TUPLE: ebnf-terminal symbol ;
58 TUPLE: ebnf-foreign word rule ;
59 TUPLE: ebnf-any-character ;
60 TUPLE: ebnf-range pattern ;
61 TUPLE: ebnf-ensure group ;
62 TUPLE: ebnf-ensure-not group ;
63 TUPLE: ebnf-choice options ;
64 TUPLE: ebnf-sequence elements ;
65 TUPLE: ebnf-ignore group ;
66 TUPLE: ebnf-repeat0 group ;
67 TUPLE: ebnf-repeat1 group ;
68 TUPLE: ebnf-optional group ;
69 TUPLE: ebnf-whitespace group ;
70 TUPLE: ebnf-tokenizer elements ;
71 TUPLE: ebnf-rule symbol elements ;
72 TUPLE: ebnf-action parser code ;
73 TUPLE: ebnf-var parser name ;
74 TUPLE: ebnf-semantic parser code ;
77 C: <ebnf-non-terminal> ebnf-non-terminal
78 C: <ebnf-terminal> ebnf-terminal
79 C: <ebnf-foreign> ebnf-foreign
80 C: <ebnf-any-character> ebnf-any-character
81 C: <ebnf-range> ebnf-range
82 C: <ebnf-ensure> ebnf-ensure
83 C: <ebnf-ensure-not> ebnf-ensure-not
84 C: <ebnf-choice> ebnf-choice
85 C: <ebnf-sequence> ebnf-sequence
86 C: <ebnf-ignore> ebnf-ignore
87 C: <ebnf-repeat0> ebnf-repeat0
88 C: <ebnf-repeat1> ebnf-repeat1
89 C: <ebnf-optional> ebnf-optional
90 C: <ebnf-whitespace> ebnf-whitespace
91 C: <ebnf-tokenizer> ebnf-tokenizer
92 C: <ebnf-rule> ebnf-rule
93 C: <ebnf-action> ebnf-action
94 C: <ebnf-var> ebnf-var
95 C: <ebnf-semantic> ebnf-semantic
98 : filter-hidden ( seq -- seq )
99 #! Remove elements that produce no AST from sequence
100 [ ebnf-ensure-not? ] reject [ ebnf-ensure? not ] filter ;
102 : syntax ( string -- parser )
103 #! Parses the string, ignoring white space, and
104 #! does not put the result in the AST.
107 : syntax-pack ( begin parser end -- parser )
108 #! Parse 'parser' surrounded by syntax elements
110 [ syntax ] 2dip syntax pack ;
112 : insert-escapes ( string -- string )
114 "\t" token [ drop "\\t" ] action ,
115 "\n" token [ drop "\\n" ] action ,
116 "\r" token [ drop "\\r" ] action ,
119 : 'identifier' ( -- parser )
120 #! Return a parser that parses an identifer delimited by
121 #! a quotation character. The quotation can be single
122 #! or double quotes. The AST produced is the identifier
123 #! between the quotes.
125 [ CHAR: " = not ] satisfy repeat1 "\"" "\"" surrounded-by ,
126 [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
127 ] choice* [ >string unescape-string ] action ;
129 : 'non-terminal' ( -- parser )
130 #! A non-terminal is the name of another rule. It can
131 #! be any non-blank character except for characters used
132 #! in the EBNF syntax itself.
157 ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
159 : 'terminal' ( -- parser )
160 #! A terminal is an identifier enclosed in quotations
161 #! and it represents the literal value of the identifier.
162 'identifier' [ <ebnf-terminal> ] action ;
164 : 'foreign-name' ( -- parser )
165 #! Parse a valid foreign parser name
171 ] satisfy repeat1 [ >string ] action ;
173 : 'foreign' ( -- parser )
174 #! A foreign call is a call to a rule in another ebnf grammar
178 'foreign-name' sp optional ,
180 ] seq* [ first2 <ebnf-foreign> ] action ;
182 : 'any-character' ( -- parser )
183 #! A parser to match the symbol for any character match.
184 [ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;
186 : 'range-parser' ( -- parser )
187 #! Match the syntax for declaring character ranges
189 [ "[" syntax , "[" token ensure-not , ] seq* hide ,
190 [ CHAR: ] = not ] satisfy repeat1 ,
192 ] seq* [ first >string unescape-string <ebnf-range> ] action ;
194 : ('element') ( -- parser )
195 #! An element of a rule. It can be a terminal or a
196 #! non-terminal but must not be followed by a "=".
197 #! The latter indicates that it is the beginning of a
208 [ dup , "~" token hide , ] seq* [ first <ebnf-ignore> ] action ,
209 [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
210 [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
211 [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
215 "=" syntax ensure-not ,
218 ] seq* [ first ] action ;
222 : 'element' ( -- parser )
225 ('element') , ":" syntax ,
226 "a-zA-Z_" range-pattern
227 "a-zA-Z0-9_-" range-pattern repeat1 2seq [ first2 swap prefix >string ] action ,
228 ] seq* [ first2 <ebnf-var> ] action ,
234 : grouped ( quot suffix -- parser )
235 #! Parse a group of choices, with a suffix indicating
236 #! the type of group (repeat0, repeat1, etc) and
237 #! an quot that is the action that produces the AST.
240 "(" [ 'choice' sp ] delay ")" syntax-pack
242 [ first ] rot compose action ,
243 "{" [ 'choice' sp ] delay "}" syntax-pack
245 [ first <ebnf-whitespace> ] rot compose action ,
248 : 'group' ( -- parser )
249 #! A grouping with no suffix. Used for precedence.
251 "~" token sp ensure-not ,
252 "*" token sp ensure-not ,
253 "+" token sp ensure-not ,
254 "?" token sp ensure-not ,
255 ] seq* hide grouped ;
257 : 'ignore' ( -- parser )
258 [ <ebnf-ignore> ] "~" syntax grouped ;
260 : 'repeat0' ( -- parser )
261 [ <ebnf-repeat0> ] "*" syntax grouped ;
263 : 'repeat1' ( -- parser )
264 [ <ebnf-repeat1> ] "+" syntax grouped ;
266 : 'optional' ( -- parser )
267 [ <ebnf-optional> ] "?" syntax grouped ;
269 : 'factor-code' ( -- parser )
271 "]]" token ensure-not ,
272 "]?" token ensure-not ,
274 ] seq* repeat0 [ "" concat-as ] action ;
276 : 'ensure-not' ( -- parser )
277 #! Parses the '!' syntax to ensure that
278 #! something that matches the following elements do
279 #! not exist in the parse stream.
283 ] seq* [ first <ebnf-ensure-not> ] action ;
285 : 'ensure' ( -- parser )
286 #! Parses the '&' syntax to ensure that
287 #! something that matches the following elements does
288 #! exist in the parse stream.
292 ] seq* [ first <ebnf-ensure> ] action ;
294 : ('sequence') ( -- parser )
295 #! A sequence of terminals and non-terminals, including
296 #! groupings of those.
308 [ dup , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
312 : 'action' ( -- parser )
313 "[[" 'factor-code' "]]" syntax-pack ;
315 : 'semantic' ( -- parser )
316 "?[" 'factor-code' "]?" syntax-pack ;
318 : 'sequence' ( -- parser )
319 #! A sequence of terminals and non-terminals, including
320 #! groupings of those.
322 [ ('sequence') , 'action' , ] seq*
323 [ first2 <ebnf-action> ] action ,
325 [ ('sequence') , 'semantic' , ] seq*
326 [ first2 <ebnf-semantic> ] action ,
330 dup length 1 = [ first ] [ <ebnf-sequence> ] if
333 : 'actioned-sequence' ( -- parser )
335 [ 'sequence' , "=>" syntax , 'action' , ] seq*
336 [ first2 <ebnf-action> ] action ,
340 : 'choice' ( -- parser )
341 'actioned-sequence' sp repeat1 [
342 dup length 1 = [ first ] [ <ebnf-sequence> ] if
343 ] action "|" token sp list-of [
344 dup length 1 = [ first ] [ <ebnf-choice> ] if
347 : 'tokenizer' ( -- parser )
351 ">" token ensure-not ,
352 [ "default" token sp , 'choice' , ] choice* ,
353 ] seq* [ first <ebnf-tokenizer> ] action ;
355 : 'rule' ( -- parser )
357 "tokenizer" token ensure-not ,
358 'non-terminal' [ symbol>> ] action ,
360 ">" token ensure-not ,
362 ] seq* [ first2 <ebnf-rule> ] action ;
364 : 'ebnf' ( -- parser )
365 [ 'tokenizer' sp , 'rule' sp , ] choice* repeat1 [ <ebnf> ] action ;
367 GENERIC: (transform) ( ast -- parser )
373 : transform ( ast -- object )
381 M: ebnf (transform) ( ast -- parser )
382 rules>> [ (transform) ] map last ;
384 M: ebnf-tokenizer (transform) ( ast -- parser )
385 elements>> dup "default" = [
386 drop default-tokenizer \ tokenizer set-global any-char
389 dup parser-tokenizer \ tokenizer set-global
392 ERROR: redefined-rule name ;
394 M: redefined-rule summary
395 name>> "Rule '" "' defined more than once" surround ;
397 M: ebnf-rule (transform) ( ast -- parser )
400 swap symbol>> dup get parser? [ redefined-rule ] [ set ] if
403 M: ebnf-sequence (transform) ( ast -- parser )
404 #! If ignore-ws is set then each element of the sequence
405 #! ignores leading whitespace. This is not inherited by
406 #! subelements of the sequence.
408 f ignore-ws [ (transform) ] with-variable
409 ignore-ws get [ sp ] when
410 ] map seq [ dup length 1 = [ first ] when ] action ;
412 M: ebnf-choice (transform) ( ast -- parser )
413 options>> [ (transform) ] map choice ;
415 M: ebnf-any-character (transform) ( ast -- parser )
416 drop tokenizer any>> call( -- parser ) ;
418 M: ebnf-range (transform) ( ast -- parser )
419 pattern>> range-pattern ;
421 : transform-group ( ast -- parser )
422 #! convert a ast node with groups to a parser for that group
423 group>> (transform) ;
425 M: ebnf-ensure (transform) ( ast -- parser )
426 transform-group ensure ;
428 M: ebnf-ensure-not (transform) ( ast -- parser )
429 transform-group ensure-not ;
431 M: ebnf-ignore (transform) ( ast -- parser )
432 transform-group [ drop ignore ] action ;
434 M: ebnf-repeat0 (transform) ( ast -- parser )
435 transform-group repeat0 ;
437 M: ebnf-repeat1 (transform) ( ast -- parser )
438 transform-group repeat1 ;
440 M: ebnf-optional (transform) ( ast -- parser )
441 transform-group optional ;
443 M: ebnf-whitespace (transform) ( ast -- parser )
444 t ignore-ws [ transform-group ] with-variable ;
446 GENERIC: build-locals ( code ast -- code )
448 M: ebnf-sequence build-locals ( code ast -- code )
449 #! Note the need to filter out this ebnf items that
450 #! leave nothing in the AST
451 elements>> filter-hidden dup length 1 = [
454 dup [ ebnf-var? ] any? not [
458 "FROM: locals => [let :> ; FROM: sequences => nth ; FROM: kernel => nip over ; [let " %
461 " " % # " over nth :> " %
474 M: ebnf-var build-locals ( code ast -- code )
476 "FROM: locals => [let :> ; FROM: kernel => dup nip ; [let " %
477 " dup :> " % name>> %
483 M: object build-locals ( code ast -- code )
486 ERROR: bad-effect quot effect ;
488 : check-action-effect ( quot -- quot )
490 { [ dup ( a -- b ) effect<= ] [ drop ] }
491 { [ dup ( -- b ) effect<= ] [ drop [ drop ] prepose ] }
495 : ebnf-transform ( ast -- parser quot )
496 [ parser>> (transform) ]
497 [ code>> insert-escapes ]
498 [ parser>> ] tri build-locals
499 string-lines parse-lines ;
501 M: ebnf-action (transform) ( ast -- parser )
502 ebnf-transform check-action-effect action ;
504 M: ebnf-semantic (transform) ( ast -- parser )
505 ebnf-transform semantic ;
507 M: ebnf-var (transform) ( ast -- parser )
508 parser>> (transform) ;
510 M: ebnf-terminal (transform) ( ast -- parser )
511 symbol>> tokenizer one>> call( symbol -- parser ) ;
513 ERROR: ebnf-foreign-not-found name ;
515 M: ebnf-foreign-not-found summary
516 name>> "Foreign word '" "' not found" surround ;
518 M: ebnf-foreign (transform) ( ast -- parser )
519 dup word>> search [ word>> ebnf-foreign-not-found ] unless*
520 swap rule>> [ main ] unless* over rule [
526 ERROR: parser-not-found name ;
528 M: ebnf-non-terminal (transform) ( ast -- parser )
530 , \ dup , parser get , \ at ,
531 [ parser-not-found ] , \ unless* , \ nip ,
534 : transform-ebnf ( string -- object )
535 'ebnf' parse transform ;
537 ERROR: unable-to-fully-parse-ebnf remaining ;
539 ERROR: could-not-parse-ebnf ;
541 : check-parse-result ( result -- result )
543 dup remaining>> [ blank? ] trim [
544 unable-to-fully-parse-ebnf
550 : parse-ebnf ( string -- hashtable )
551 'ebnf' (parse) check-parse-result ast>> transform ;
553 : ebnf>quot ( string -- hashtable quot )
554 parse-ebnf dup dup parser [ main of compile ] with-variable
555 [ compiled-parse ] curry [ with-scope ast>> ] curry ;
561 reset-tokenizer parse-multiline-string parse-ebnf main of
562 suffix! reset-tokenizer ;
566 reset-tokenizer parse-multiline-string ebnf>quot nip
567 suffix! \ call suffix! reset-tokenizer ;
570 reset-tokenizer scan-new-word dup ";EBNF" parse-multiline-string
572 ( input -- ast ) define-declared "ebnf-parser" set-word-prop