basis/peg/ebnf/ebnf.factor

   1 ! Copyright (C) 2007 Chris Double.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors arrays assocs combinators
   4 combinators.short-circuit effects io.streams.string kernel make
   5 math.parser multiline namespaces parser peg peg.parsers
   6 peg.search quotations sequences splitting stack-checker strings
   7 summary unicode.categories words ;
   8 FROM: compiler.units => with-compilation-unit ;
   9 FROM: strings.parser => unescape-string ;
  10 FROM: vocabs.parser => search ;
  11 FROM: peg.search => replace ;
  12 IN: peg.ebnf
  13
  14 : rule ( name word -- parser )
  15     #! Given an EBNF word produced from EBNF: return the EBNF rule
  16     "ebnf-parser" word-prop at ;
  17
  18 ERROR: no-rule rule parser ;
  19
  20 <PRIVATE
  21
  22 : lookup-rule ( rule parser -- rule' )
  23         2dup rule [ 2nip ] [ no-rule ] if* ;
  24
  25 TUPLE: tokenizer-tuple any one many ;
  26
  27 : default-tokenizer ( -- tokenizer )
  28     T{ tokenizer-tuple f
  29         [ any-char ]
  30         [ token ]
  31         [ [ = ] curry any-char swap semantic ]
  32     } ;
  33
  34 : parser-tokenizer ( parser -- tokenizer )
  35     [ 1quotation ] keep
  36     [ swap [ = ] curry semantic ] curry dup \ tokenizer-tuple boa ;
  37
  38 : rule-tokenizer ( name word -- tokenizer )
  39     rule parser-tokenizer ;
  40
  41 : tokenizer ( -- word )
  42     \ tokenizer get-global [ default-tokenizer ] unless* ;
  43
  44 : reset-tokenizer ( -- )
  45     default-tokenizer \ tokenizer set-global ;
  46
  47 ERROR: no-tokenizer name ;
  48
  49 M: no-tokenizer summary
  50     drop "Tokenizer not found" ;
  51
  52 SYNTAX: TOKENIZER:
  53     scan-word-name dup search [ nip ] [ no-tokenizer ] if*
  54     execute( -- tokenizer ) \ tokenizer set-global ;
  55
  56 TUPLE: ebnf-non-terminal symbol ;
  57 TUPLE: ebnf-terminal symbol ;
  58 TUPLE: ebnf-foreign word rule ;
  59 TUPLE: ebnf-any-character ;
  60 TUPLE: ebnf-range pattern ;
  61 TUPLE: ebnf-ensure group ;
  62 TUPLE: ebnf-ensure-not group ;
  63 TUPLE: ebnf-choice options ;
  64 TUPLE: ebnf-sequence elements ;
  65 TUPLE: ebnf-ignore group ;
  66 TUPLE: ebnf-repeat0 group ;
  67 TUPLE: ebnf-repeat1 group ;
  68 TUPLE: ebnf-optional group ;
  69 TUPLE: ebnf-whitespace group ;
  70 TUPLE: ebnf-tokenizer elements ;
  71 TUPLE: ebnf-rule symbol elements ;
  72 TUPLE: ebnf-action parser code ;
  73 TUPLE: ebnf-var parser name ;
  74 TUPLE: ebnf-semantic parser code ;
  75 TUPLE: ebnf rules ;
  76
  77 C: <ebnf-non-terminal> ebnf-non-terminal
  78 C: <ebnf-terminal> ebnf-terminal
  79 C: <ebnf-foreign> ebnf-foreign
  80 C: <ebnf-any-character> ebnf-any-character
  81 C: <ebnf-range> ebnf-range
  82 C: <ebnf-ensure> ebnf-ensure
  83 C: <ebnf-ensure-not> ebnf-ensure-not
  84 C: <ebnf-choice> ebnf-choice
  85 C: <ebnf-sequence> ebnf-sequence
  86 C: <ebnf-ignore> ebnf-ignore
  87 C: <ebnf-repeat0> ebnf-repeat0
  88 C: <ebnf-repeat1> ebnf-repeat1
  89 C: <ebnf-optional> ebnf-optional
  90 C: <ebnf-whitespace> ebnf-whitespace
  91 C: <ebnf-tokenizer> ebnf-tokenizer
  92 C: <ebnf-rule> ebnf-rule
  93 C: <ebnf-action> ebnf-action
  94 C: <ebnf-var> ebnf-var
  95 C: <ebnf-semantic> ebnf-semantic
  96 C: <ebnf> ebnf
  97
  98 : filter-hidden ( seq -- seq )
  99     #! Remove elements that produce no AST from sequence
 100     [ ebnf-ensure-not? ] reject [ ebnf-ensure? not ] filter ;
 101
 102 : syntax ( string -- parser )
 103     #! Parses the string, ignoring white space, and
 104     #! does not put the result in the AST.
 105     token sp hide ;
 106
 107 : syntax-pack ( begin parser end -- parser )
 108     #! Parse 'parser' surrounded by syntax elements
 109     #! begin and end.
 110     [ syntax ] 2dip syntax pack ;
 111
 112 : insert-escapes ( string -- string )
 113     [
 114         "\t" token [ drop "\\t" ] action ,
 115         "\n" token [ drop "\\n" ] action ,
 116         "\r" token [ drop "\\r" ] action ,
 117     ] choice* replace ;
 118
 119 : 'identifier' ( -- parser )
 120     #! Return a parser that parses an identifer delimited by
 121     #! a quotation character. The quotation can be single
 122     #! or double quotes. The AST produced is the identifier
 123     #! between the quotes.
 124     [
 125         [ CHAR: " = not ] satisfy repeat1 "\"" "\"" surrounded-by ,
 126         [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
 127     ] choice* [ >string unescape-string ] action ;
 128
 129 : 'non-terminal' ( -- parser )
 130     #! A non-terminal is the name of another rule. It can
 131     #! be any non-blank character except for characters used
 132     #! in the EBNF syntax itself.
 133     [
 134         {
 135             [ blank?    ]
 136             [ CHAR: " = ]
 137             [ CHAR: ' = ]
 138             [ CHAR: | = ]
 139             [ CHAR: { = ]
 140             [ CHAR: } = ]
 141             [ CHAR: = = ]
 142             [ CHAR: ) = ]
 143             [ CHAR: ( = ]
 144             [ CHAR: ] = ]
 145             [ CHAR: [ = ]
 146             [ CHAR: . = ]
 147             [ CHAR: ! = ]
 148             [ CHAR: & = ]
 149             [ CHAR: * = ]
 150             [ CHAR: + = ]
 151             [ CHAR: ? = ]
 152             [ CHAR: : = ]
 153             [ CHAR: ~ = ]
 154             [ CHAR: < = ]
 155             [ CHAR: > = ]
 156         } 1|| not
 157     ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
 158
 159 : 'terminal' ( -- parser )
 160     #! A terminal is an identifier enclosed in quotations
 161     #! and it represents the literal value of the identifier.
 162     'identifier' [ <ebnf-terminal> ] action ;
 163
 164 : 'foreign-name' ( -- parser )
 165     #! Parse a valid foreign parser name
 166     [
 167         {
 168             [ blank?        ]
 169             [ CHAR: > = ]
 170         } 1|| not
 171     ] satisfy repeat1 [ >string ] action ;
 172
 173 : 'foreign' ( -- parser )
 174     #! A foreign call is a call to a rule in another ebnf grammar
 175     [
 176         "<foreign" syntax ,
 177         'foreign-name' sp ,
 178         'foreign-name' sp optional ,
 179         ">" syntax ,
 180     ] seq* [ first2 <ebnf-foreign> ] action ;
 181
 182 : 'any-character' ( -- parser )
 183     #! A parser to match the symbol for any character match.
 184     [ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;
 185
 186 : 'range-parser' ( -- parser )
 187     #! Match the syntax for declaring character ranges
 188     [
 189         [ "[" syntax , "[" token ensure-not , ] seq* hide ,
 190         [ CHAR: ] = not ] satisfy repeat1 ,
 191         "]" syntax ,
 192     ] seq* [ first >string unescape-string <ebnf-range> ] action ;
 193
 194 : ('element') ( -- parser )
 195     #! An element of a rule. It can be a terminal or a
 196     #! non-terminal but must not be followed by a "=".
 197     #! The latter indicates that it is the beginning of a
 198     #! new rule.
 199     [
 200         [
 201             [
 202                 'non-terminal' ,
 203                 'terminal' ,
 204                 'foreign' ,
 205                 'range-parser' ,
 206                 'any-character' ,
 207             ] choice*
 208             [ dup , "~" token hide , ] seq* [ first <ebnf-ignore> ] action ,
 209             [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
 210             [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
 211             [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
 212             ,
 213         ] choice* ,
 214         [
 215             "=" syntax ensure-not ,
 216             "=>" syntax ensure ,
 217         ] choice* ,
 218     ] seq* [ first ] action ;
 219
 220 DEFER: 'action'
 221
 222 : 'element' ( -- parser )
 223     [
 224         [
 225             ('element') , ":" syntax ,
 226             "a-zA-Z_" range-pattern
 227             "a-zA-Z0-9_-" range-pattern repeat1 2seq [ first2 swap prefix >string ] action ,
 228         ] seq* [ first2 <ebnf-var> ] action ,
 229         ('element') ,
 230     ] choice* ;
 231
 232 DEFER: 'choice'
 233
 234 : grouped ( quot suffix -- parser )
 235     #! Parse a group of choices, with a suffix indicating
 236     #! the type of group (repeat0, repeat1, etc) and
 237     #! an quot that is the action that produces the AST.
 238     2dup
 239     [
 240         "(" [ 'choice' sp ] delay ")" syntax-pack
 241         swap 2seq
 242         [ first ] rot compose action ,
 243         "{" [ 'choice' sp ] delay "}" syntax-pack
 244         swap 2seq
 245         [ first <ebnf-whitespace> ] rot compose action ,
 246     ] choice* ;
 247
 248 : 'group' ( -- parser )
 249     #! A grouping with no suffix. Used for precedence.
 250     [ ] [
 251         "~" token sp ensure-not ,
 252         "*" token sp ensure-not ,
 253         "+" token sp ensure-not ,
 254         "?" token sp ensure-not ,
 255     ] seq* hide grouped ;
 256
 257 : 'ignore' ( -- parser )
 258     [ <ebnf-ignore> ] "~" syntax grouped ;
 259
 260 : 'repeat0' ( -- parser )
 261     [ <ebnf-repeat0> ] "*" syntax grouped ;
 262
 263 : 'repeat1' ( -- parser )
 264     [ <ebnf-repeat1> ] "+" syntax grouped ;
 265
 266 : 'optional' ( -- parser )
 267     [ <ebnf-optional> ] "?" syntax grouped ;
 268
 269 : 'factor-code' ( -- parser )
 270     [
 271         "]]" token ensure-not ,
 272         "]?" token ensure-not ,
 273         [ drop t ] satisfy ,
 274     ] seq* repeat0 [ "" concat-as ] action ;
 275
 276 : 'ensure-not' ( -- parser )
 277     #! Parses the '!' syntax to ensure that
 278     #! something that matches the following elements do
 279     #! not exist in the parse stream.
 280     [
 281         "!" syntax ,
 282         'group' sp ,
 283     ] seq* [ first <ebnf-ensure-not> ] action ;
 284
 285 : 'ensure' ( -- parser )
 286     #! Parses the '&' syntax to ensure that
 287     #! something that matches the following elements does
 288     #! exist in the parse stream.
 289     [
 290         "&" syntax ,
 291         'group' sp ,
 292     ] seq* [ first <ebnf-ensure> ] action ;
 293
 294 : ('sequence') ( -- parser )
 295     #! A sequence of terminals and non-terminals, including
 296     #! groupings of those.
 297     [
 298         [
 299             'ensure-not' sp ,
 300             'ensure' sp ,
 301             'element' sp ,
 302             'group' sp ,
 303             'ignore' sp ,
 304             'repeat0' sp ,
 305             'repeat1' sp ,
 306             'optional' sp ,
 307         ] choice*
 308         [ dup    , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
 309         ,
 310     ] choice* ;
 311
 312 : 'action' ( -- parser )
 313      "[[" 'factor-code' "]]" syntax-pack ;
 314
 315 : 'semantic' ( -- parser )
 316      "?[" 'factor-code' "]?" syntax-pack ;
 317
 318 : 'sequence' ( -- parser )
 319     #! A sequence of terminals and non-terminals, including
 320     #! groupings of those.
 321     [
 322         [ ('sequence') , 'action' , ] seq*
 323         [ first2 <ebnf-action> ] action ,
 324
 325         [ ('sequence') , 'semantic' , ] seq*
 326         [ first2 <ebnf-semantic> ] action ,
 327
 328         ('sequence') ,
 329     ] choice* repeat1 [
 330          dup length 1 = [ first ] [ <ebnf-sequence> ] if
 331     ] action ;
 332
 333 : 'actioned-sequence' ( -- parser )
 334     [
 335         [ 'sequence' , "=>" syntax , 'action' , ] seq*
 336         [ first2 <ebnf-action> ] action ,
 337         'sequence' ,
 338     ] choice* ;
 339
 340 : 'choice' ( -- parser )
 341     'actioned-sequence' sp repeat1 [
 342         dup length 1 = [ first ] [ <ebnf-sequence> ] if
 343     ] action "|" token sp list-of [
 344         dup length 1 = [ first ] [ <ebnf-choice> ] if
 345     ] action ;
 346
 347 : 'tokenizer' ( -- parser )
 348     [
 349         "tokenizer" syntax ,
 350         "=" syntax ,
 351         ">" token ensure-not ,
 352         [ "default" token sp , 'choice' , ] choice* ,
 353     ] seq* [ first <ebnf-tokenizer> ] action ;
 354
 355 : 'rule' ( -- parser )
 356     [
 357         "tokenizer" token ensure-not ,
 358         'non-terminal' [ symbol>> ] action ,
 359         "=" syntax ,
 360         ">" token ensure-not ,
 361         'choice' ,
 362     ] seq* [ first2 <ebnf-rule> ] action ;
 363
 364 : 'ebnf' ( -- parser )
 365     [ 'tokenizer' sp , 'rule' sp , ] choice* repeat1 [ <ebnf> ] action ;
 366
 367 GENERIC: (transform) ( ast -- parser )
 368
 369 SYMBOL: parser
 370 SYMBOL: main
 371 SYMBOL: ignore-ws
 372
 373 : transform ( ast -- object )
 374     H{ } clone dup dup [
 375         f ignore-ws set
 376         parser set
 377         swap (transform)
 378         main set
 379     ] with-variables ;
 380
 381 M: ebnf (transform) ( ast -- parser )
 382     rules>> [ (transform) ] map last ;
 383
 384 M: ebnf-tokenizer (transform) ( ast -- parser )
 385     elements>> dup "default" = [
 386         drop default-tokenizer \ tokenizer set-global any-char
 387     ] [
 388         (transform)
 389         dup parser-tokenizer \ tokenizer set-global
 390     ] if ;
 391
 392 ERROR: redefined-rule name ;
 393
 394 M: redefined-rule summary
 395     name>> "Rule '" "' defined more than once" surround ;
 396
 397 M: ebnf-rule (transform) ( ast -- parser )
 398     dup elements>>
 399     (transform) [
 400         swap symbol>> dup get parser? [ redefined-rule ] [ set ] if
 401     ] keep ;
 402
 403 M: ebnf-sequence (transform) ( ast -- parser )
 404     #! If ignore-ws is set then each element of the sequence
 405     #! ignores leading whitespace. This is not inherited by
 406     #! subelements of the sequence.
 407     elements>> [
 408         f ignore-ws [ (transform) ] with-variable
 409         ignore-ws get [ sp ] when
 410     ] map seq [ dup length 1 = [ first ] when ] action ;
 411
 412 M: ebnf-choice (transform) ( ast -- parser )
 413     options>> [ (transform) ] map choice ;
 414
 415 M: ebnf-any-character (transform) ( ast -- parser )
 416     drop tokenizer any>> call( -- parser ) ;
 417
 418 M: ebnf-range (transform) ( ast -- parser )
 419     pattern>> range-pattern ;
 420
 421 : transform-group ( ast -- parser )
 422     #! convert a ast node with groups to a parser for that group
 423     group>> (transform) ;
 424
 425 M: ebnf-ensure (transform) ( ast -- parser )
 426     transform-group ensure ;
 427
 428 M: ebnf-ensure-not (transform) ( ast -- parser )
 429     transform-group ensure-not ;
 430
 431 M: ebnf-ignore (transform) ( ast -- parser )
 432     transform-group [ drop ignore ] action ;
 433
 434 M: ebnf-repeat0 (transform) ( ast -- parser )
 435     transform-group repeat0 ;
 436
 437 M: ebnf-repeat1 (transform) ( ast -- parser )
 438     transform-group repeat1 ;
 439
 440 M: ebnf-optional (transform) ( ast -- parser )
 441     transform-group optional ;
 442
 443 M: ebnf-whitespace (transform) ( ast -- parser )
 444     t ignore-ws [ transform-group ] with-variable ;
 445
 446 GENERIC: build-locals ( code ast -- code )
 447
 448 M: ebnf-sequence build-locals ( code ast -- code )
 449     #! Note the need to filter out this ebnf items that
 450     #! leave nothing in the AST
 451     elements>> filter-hidden dup length 1 = [
 452         first build-locals
 453     ] [
 454         dup [ ebnf-var? ] any? not [
 455             drop
 456         ] [
 457             [
 458                 "FROM: locals => [let :> ; FROM: sequences => nth ; FROM: kernel => nip over ; [let " %
 459                 [
 460                     over ebnf-var? [
 461                         " " % # " over nth :> " %
 462                         name>> %
 463                     ] [
 464                         2drop
 465                     ] if
 466                 ] each-index
 467                 " " %
 468                 %
 469                 " nip ]" %
 470              ] "" make
 471         ] if
 472     ] if ;
 473
 474 M: ebnf-var build-locals ( code ast -- code )
 475     [
 476         "FROM: locals => [let :> ; FROM: kernel => dup nip ; [let " %
 477         " dup :> " % name>> %
 478         " " %
 479         %
 480         " nip ]" %
 481     ] "" make ;
 482
 483 M: object build-locals ( code ast -- code )
 484     drop ;
 485
 486 ERROR: bad-effect quot effect ;
 487
 488 : check-action-effect ( quot -- quot )
 489     dup infer {
 490         { [ dup ( a -- b ) effect<= ] [ drop ] }
 491         { [ dup ( -- b ) effect<= ] [ drop [ drop ] prepose ] }
 492         [ bad-effect ]
 493     } cond ;
 494
 495 : ebnf-transform ( ast -- parser quot )
 496     [ parser>> (transform) ]
 497     [ code>> insert-escapes ]
 498     [ parser>> ] tri build-locals
 499     string-lines parse-lines ;
 500
 501 M: ebnf-action (transform) ( ast -- parser )
 502     ebnf-transform check-action-effect action ;
 503
 504 M: ebnf-semantic (transform) ( ast -- parser )
 505     ebnf-transform semantic ;
 506
 507 M: ebnf-var (transform) ( ast -- parser )
 508     parser>> (transform) ;
 509
 510 M: ebnf-terminal (transform) ( ast -- parser )
 511     symbol>> tokenizer one>> call( symbol -- parser ) ;
 512
 513 ERROR: ebnf-foreign-not-found name ;
 514
 515 M: ebnf-foreign-not-found summary
 516     name>> "Foreign word '" "' not found" surround ;
 517
 518 M: ebnf-foreign (transform) ( ast -- parser )
 519     dup word>> search [ word>> ebnf-foreign-not-found ] unless*
 520     swap rule>> [ main ] unless* over rule [
 521         nip
 522     ] [
 523         execute( -- parser )
 524     ] if* ;
 525
 526 ERROR: parser-not-found name ;
 527
 528 M: ebnf-non-terminal (transform) ( ast -- parser )
 529     symbol>> [
 530         , \ dup , parser get , \ at ,
 531         [ parser-not-found ] , \ unless* , \ nip ,
 532     ] [ ] make box ;
 533
 534 : transform-ebnf ( string -- object )
 535     'ebnf' parse transform ;
 536
 537 ERROR: unable-to-fully-parse-ebnf remaining ;
 538
 539 ERROR: could-not-parse-ebnf ;
 540
 541 : check-parse-result ( result -- result )
 542     [
 543         dup remaining>> [ blank? ] trim [
 544             unable-to-fully-parse-ebnf
 545         ] unless-empty
 546     ] [
 547         could-not-parse-ebnf
 548     ] if* ;
 549
 550 : parse-ebnf ( string -- hashtable )
 551     'ebnf' (parse) check-parse-result ast>> transform ;
 552
 553 : ebnf>quot ( string -- hashtable quot )
 554     parse-ebnf dup dup parser [ main of compile ] with-variable
 555     [ compiled-parse ] curry [ with-scope ast>> ] curry ;
 556
 557 PRIVATE>
 558
 559 SYNTAX: <EBNF
 560     "EBNF>"
 561     reset-tokenizer parse-multiline-string parse-ebnf main of
 562     suffix! reset-tokenizer ;
 563
 564 SYNTAX: [EBNF
 565     "EBNF]"
 566     reset-tokenizer parse-multiline-string ebnf>quot nip
 567     suffix! \ call suffix! reset-tokenizer ;
 568
 569 SYNTAX: EBNF:
 570     reset-tokenizer scan-new-word dup ";EBNF" parse-multiline-string
 571     ebnf>quot swapd
 572     ( input -- ast ) define-declared "ebnf-parser" set-word-prop
 573     reset-tokenizer ;