1 ! Copyright (C) 2007 Chris Double.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators
4 combinators.short-circuit effects io.streams.string kernel make
5 math.parser multiline namespaces parser peg peg.parsers
6 peg.search quotations sequences splitting stack-checker strings
7 strings.parser summary unicode.categories words ;
8 FROM: vocabs.parser => search ;
9 FROM: peg.search => replace ;
12 : rule ( name word -- parser )
13 #! Given an EBNF word produced from EBNF: return the EBNF rule
14 "ebnf-parser" word-prop at ;
16 ERROR: no-rule rule parser ;
20 : lookup-rule ( rule parser -- rule' )
21 2dup rule [ 2nip ] [ throw-no-rule ] if* ;
23 TUPLE: tokenizer-tuple any one many ;
25 : default-tokenizer ( -- tokenizer )
29 [ [ = ] curry any-char swap semantic ]
32 : parser-tokenizer ( parser -- tokenizer )
34 [ swap [ = ] curry semantic ] curry dup \ tokenizer-tuple boa ;
36 : rule-tokenizer ( name word -- tokenizer )
37 rule parser-tokenizer ;
39 : tokenizer ( -- word )
40 \ tokenizer get-global [ default-tokenizer ] unless* ;
42 : reset-tokenizer ( -- )
43 default-tokenizer \ tokenizer set-global ;
45 ERROR: no-tokenizer name ;
47 M: no-tokenizer summary
48 drop "Tokenizer not found" ;
51 scan-word-name dup search [ nip ] [ throw-no-tokenizer ] if*
52 execute( -- tokenizer ) \ tokenizer set-global ;
54 TUPLE: ebnf-non-terminal symbol ;
55 TUPLE: ebnf-terminal symbol ;
56 TUPLE: ebnf-foreign word rule ;
57 TUPLE: ebnf-any-character ;
58 TUPLE: ebnf-range pattern ;
59 TUPLE: ebnf-ensure group ;
60 TUPLE: ebnf-ensure-not group ;
61 TUPLE: ebnf-choice options ;
62 TUPLE: ebnf-sequence elements ;
63 TUPLE: ebnf-ignore group ;
64 TUPLE: ebnf-repeat0 group ;
65 TUPLE: ebnf-repeat1 group ;
66 TUPLE: ebnf-optional group ;
67 TUPLE: ebnf-whitespace group ;
68 TUPLE: ebnf-tokenizer elements ;
69 TUPLE: ebnf-rule symbol elements ;
70 TUPLE: ebnf-action parser code ;
71 TUPLE: ebnf-var parser name ;
72 TUPLE: ebnf-semantic parser code ;
75 C: <ebnf-non-terminal> ebnf-non-terminal
76 C: <ebnf-terminal> ebnf-terminal
77 C: <ebnf-foreign> ebnf-foreign
78 C: <ebnf-any-character> ebnf-any-character
79 C: <ebnf-range> ebnf-range
80 C: <ebnf-ensure> ebnf-ensure
81 C: <ebnf-ensure-not> ebnf-ensure-not
82 C: <ebnf-choice> ebnf-choice
83 C: <ebnf-sequence> ebnf-sequence
84 C: <ebnf-ignore> ebnf-ignore
85 C: <ebnf-repeat0> ebnf-repeat0
86 C: <ebnf-repeat1> ebnf-repeat1
87 C: <ebnf-optional> ebnf-optional
88 C: <ebnf-whitespace> ebnf-whitespace
89 C: <ebnf-tokenizer> ebnf-tokenizer
90 C: <ebnf-rule> ebnf-rule
91 C: <ebnf-action> ebnf-action
92 C: <ebnf-var> ebnf-var
93 C: <ebnf-semantic> ebnf-semantic
96 : filter-hidden ( seq -- seq )
97 #! Remove elements that produce no AST from sequence
98 [ ebnf-ensure-not? ] reject [ ebnf-ensure? not ] filter ;
100 : syntax ( string -- parser )
101 #! Parses the string, ignoring white space, and
102 #! does not put the result in the AST.
105 : syntax-pack ( begin parser end -- parser )
106 #! Parse 'parser' surrounded by syntax elements
108 [ syntax ] 2dip syntax pack ;
110 : insert-escapes ( string -- string )
112 "\t" token [ drop "\\t" ] action ,
113 "\n" token [ drop "\\n" ] action ,
114 "\r" token [ drop "\\r" ] action ,
117 : 'identifier' ( -- parser )
118 #! Return a parser that parses an identifer delimited by
119 #! a quotation character. The quotation can be single
120 #! or double quotes. The AST produced is the identifier
121 #! between the quotes.
123 [ CHAR: " = not ] satisfy repeat1 "\"" "\"" surrounded-by ,
124 [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
125 ] choice* [ >string unescape-string ] action ;
127 : 'non-terminal' ( -- parser )
128 #! A non-terminal is the name of another rule. It can
129 #! be any non-blank character except for characters used
130 #! in the EBNF syntax itself.
155 ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
157 : 'terminal' ( -- parser )
158 #! A terminal is an identifier enclosed in quotations
159 #! and it represents the literal value of the identifier.
160 'identifier' [ <ebnf-terminal> ] action ;
162 : 'foreign-name' ( -- parser )
163 #! Parse a valid foreign parser name
169 ] satisfy repeat1 [ >string ] action ;
171 : 'foreign' ( -- parser )
172 #! A foreign call is a call to a rule in another ebnf grammar
176 'foreign-name' sp optional ,
178 ] seq* [ first2 <ebnf-foreign> ] action ;
180 : 'any-character' ( -- parser )
181 #! A parser to match the symbol for any character match.
182 [ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;
184 : 'range-parser' ( -- parser )
185 #! Match the syntax for declaring character ranges
187 [ "[" syntax , "[" token ensure-not , ] seq* hide ,
188 [ CHAR: ] = not ] satisfy repeat1 ,
190 ] seq* [ first >string unescape-string <ebnf-range> ] action ;
192 : ('element') ( -- parser )
193 #! An element of a rule. It can be a terminal or a
194 #! non-terminal but must not be followed by a "=".
195 #! The latter indicates that it is the beginning of a
206 [ dup , "~" token hide , ] seq* [ first <ebnf-ignore> ] action ,
207 [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
208 [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
209 [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
213 "=" syntax ensure-not ,
216 ] seq* [ first ] action ;
220 : 'element' ( -- parser )
223 ('element') , ":" syntax ,
224 "a-zA-Z_" range-pattern
225 "a-zA-Z0-9_-" range-pattern repeat1 2seq [ first2 swap prefix >string ] action ,
226 ] seq* [ first2 <ebnf-var> ] action ,
232 : grouped ( quot suffix -- parser )
233 #! Parse a group of choices, with a suffix indicating
234 #! the type of group (repeat0, repeat1, etc) and
235 #! an quot that is the action that produces the AST.
238 "(" [ 'choice' sp ] delay ")" syntax-pack
240 [ first ] rot compose action ,
241 "{" [ 'choice' sp ] delay "}" syntax-pack
243 [ first <ebnf-whitespace> ] rot compose action ,
246 : 'group' ( -- parser )
247 #! A grouping with no suffix. Used for precedence.
249 "~" token sp ensure-not ,
250 "*" token sp ensure-not ,
251 "+" token sp ensure-not ,
252 "?" token sp ensure-not ,
253 ] seq* hide grouped ;
255 : 'ignore' ( -- parser )
256 [ <ebnf-ignore> ] "~" syntax grouped ;
258 : 'repeat0' ( -- parser )
259 [ <ebnf-repeat0> ] "*" syntax grouped ;
261 : 'repeat1' ( -- parser )
262 [ <ebnf-repeat1> ] "+" syntax grouped ;
264 : 'optional' ( -- parser )
265 [ <ebnf-optional> ] "?" syntax grouped ;
267 : 'factor-code' ( -- parser )
269 "]]" token ensure-not ,
270 "]?" token ensure-not ,
272 ] seq* repeat0 [ "" concat-as ] action ;
274 : 'ensure-not' ( -- parser )
275 #! Parses the '!' syntax to ensure that
276 #! something that matches the following elements do
277 #! not exist in the parse stream.
281 ] seq* [ first <ebnf-ensure-not> ] action ;
283 : 'ensure' ( -- parser )
284 #! Parses the '&' syntax to ensure that
285 #! something that matches the following elements does
286 #! exist in the parse stream.
290 ] seq* [ first <ebnf-ensure> ] action ;
292 : ('sequence') ( -- parser )
293 #! A sequence of terminals and non-terminals, including
294 #! groupings of those.
306 [ dup , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
310 : 'action' ( -- parser )
311 "[[" 'factor-code' "]]" syntax-pack ;
313 : 'semantic' ( -- parser )
314 "?[" 'factor-code' "]?" syntax-pack ;
316 : 'sequence' ( -- parser )
317 #! A sequence of terminals and non-terminals, including
318 #! groupings of those.
320 [ ('sequence') , 'action' , ] seq*
321 [ first2 <ebnf-action> ] action ,
323 [ ('sequence') , 'semantic' , ] seq*
324 [ first2 <ebnf-semantic> ] action ,
328 dup length 1 = [ first ] [ <ebnf-sequence> ] if
331 : 'actioned-sequence' ( -- parser )
333 [ 'sequence' , "=>" syntax , 'action' , ] seq*
334 [ first2 <ebnf-action> ] action ,
338 : 'choice' ( -- parser )
339 'actioned-sequence' sp repeat1 [
340 dup length 1 = [ first ] [ <ebnf-sequence> ] if
341 ] action "|" token sp list-of [
342 dup length 1 = [ first ] [ <ebnf-choice> ] if
345 : 'tokenizer' ( -- parser )
349 ">" token ensure-not ,
350 [ "default" token sp , 'choice' , ] choice* ,
351 ] seq* [ first <ebnf-tokenizer> ] action ;
353 : 'rule' ( -- parser )
355 "tokenizer" token ensure-not ,
356 'non-terminal' [ symbol>> ] action ,
358 ">" token ensure-not ,
360 ] seq* [ first2 <ebnf-rule> ] action ;
362 : 'ebnf' ( -- parser )
363 [ 'tokenizer' sp , 'rule' sp , ] choice* repeat1 [ <ebnf> ] action ;
365 GENERIC: (transform) ( ast -- parser )
371 : transform ( ast -- object )
379 M: ebnf (transform) ( ast -- parser )
380 rules>> [ (transform) ] map last ;
382 M: ebnf-tokenizer (transform) ( ast -- parser )
383 elements>> dup "default" = [
384 drop default-tokenizer \ tokenizer set-global any-char
387 dup parser-tokenizer \ tokenizer set-global
390 ERROR: redefined-rule name ;
392 M: redefined-rule summary
393 name>> "Rule '" "' defined more than once" surround ;
395 M: ebnf-rule (transform) ( ast -- parser )
398 swap symbol>> dup get parser? [ throw-redefined-rule ] [ set ] if
401 M: ebnf-sequence (transform) ( ast -- parser )
402 #! If ignore-ws is set then each element of the sequence
403 #! ignores leading whitespace. This is not inherited by
404 #! subelements of the sequence.
406 f ignore-ws [ (transform) ] with-variable
407 ignore-ws get [ sp ] when
408 ] map seq [ dup length 1 = [ first ] when ] action ;
410 M: ebnf-choice (transform) ( ast -- parser )
411 options>> [ (transform) ] map choice ;
413 M: ebnf-any-character (transform) ( ast -- parser )
414 drop tokenizer any>> call( -- parser ) ;
416 M: ebnf-range (transform) ( ast -- parser )
417 pattern>> range-pattern ;
419 : transform-group ( ast -- parser )
420 #! convert a ast node with groups to a parser for that group
421 group>> (transform) ;
423 M: ebnf-ensure (transform) ( ast -- parser )
424 transform-group ensure ;
426 M: ebnf-ensure-not (transform) ( ast -- parser )
427 transform-group ensure-not ;
429 M: ebnf-ignore (transform) ( ast -- parser )
430 transform-group [ drop ignore ] action ;
432 M: ebnf-repeat0 (transform) ( ast -- parser )
433 transform-group repeat0 ;
435 M: ebnf-repeat1 (transform) ( ast -- parser )
436 transform-group repeat1 ;
438 M: ebnf-optional (transform) ( ast -- parser )
439 transform-group optional ;
441 M: ebnf-whitespace (transform) ( ast -- parser )
442 t ignore-ws [ transform-group ] with-variable ;
444 GENERIC: build-locals ( code ast -- code )
446 M: ebnf-sequence build-locals ( code ast -- code )
447 #! Note the need to filter out this ebnf items that
448 #! leave nothing in the AST
449 elements>> filter-hidden dup length 1 = [
452 dup [ ebnf-var? ] any? not [
456 "FROM: locals => [let :> ; FROM: sequences => nth ; FROM: kernel => nip over ; [let " %
459 " " % # " over nth :> " %
472 M: ebnf-var build-locals ( code ast -- code )
474 "FROM: locals => [let :> ; FROM: kernel => dup nip ; [let " %
475 " dup :> " % name>> %
481 M: object build-locals ( code ast -- code )
484 ERROR: bad-effect quot effect ;
486 : check-action-effect ( quot -- quot )
488 { [ dup ( a -- b ) effect<= ] [ drop ] }
489 { [ dup ( -- b ) effect<= ] [ drop [ drop ] prepose ] }
493 : ebnf-transform ( ast -- parser quot )
494 [ parser>> (transform) ]
495 [ code>> insert-escapes ]
496 [ parser>> ] tri build-locals
497 string-lines parse-lines ;
499 M: ebnf-action (transform) ( ast -- parser )
500 ebnf-transform check-action-effect action ;
502 M: ebnf-semantic (transform) ( ast -- parser )
503 ebnf-transform semantic ;
505 M: ebnf-var (transform) ( ast -- parser )
506 parser>> (transform) ;
508 M: ebnf-terminal (transform) ( ast -- parser )
509 symbol>> tokenizer one>> call( symbol -- parser ) ;
511 ERROR: ebnf-foreign-not-found name ;
513 M: ebnf-foreign-not-found summary
514 name>> "Foreign word '" "' not found" surround ;
516 M: ebnf-foreign (transform) ( ast -- parser )
517 dup word>> search [ word>> ebnf-foreign-not-found ] unless*
518 swap rule>> [ main ] unless* over rule [
524 ERROR: parser-not-found name ;
526 M: ebnf-non-terminal (transform) ( ast -- parser )
528 , \ dup , parser get , \ at ,
529 [ parser-not-found ] , \ unless* , \ nip ,
532 : transform-ebnf ( string -- object )
533 'ebnf' parse transform ;
535 ERROR: unable-to-fully-parse-ebnf remaining ;
537 ERROR: could-not-parse-ebnf ;
539 : check-parse-result ( result -- result )
541 dup remaining>> [ blank? ] trim [
542 unable-to-fully-parse-ebnf
548 : parse-ebnf ( string -- hashtable )
549 'ebnf' (parse) check-parse-result ast>> transform ;
551 : ebnf>quot ( string -- hashtable quot )
552 parse-ebnf dup dup parser [ main of compile ] with-variable
553 [ compiled-parse ] curry [ with-scope ast>> ] curry ;
559 reset-tokenizer parse-multiline-string parse-ebnf main of
560 suffix! reset-tokenizer ;
564 reset-tokenizer parse-multiline-string ebnf>quot nip
565 suffix! \ call suffix! reset-tokenizer ;
568 reset-tokenizer scan-new-word dup ";EBNF" parse-multiline-string
570 ( input -- ast ) define-declared "ebnf-parser" set-word-prop