basis/peg/peg.factor

   1 ! Copyright (C) 2007, 2008 Chris Double.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: kernel sequences strings fry namespaces make math assocs
   4 io vectors arrays math.parser math.order combinators classes
   5 sets unicode.categories compiler.units parser effects.parser
   6 words quotations memoize accessors locals splitting
   7 combinators.short-circuit generalizations ;
   8 FROM: namespaces => set ;
   9 IN: peg
  10
  11 TUPLE: parse-result remaining ast ;
     #! A successful parse: 'remaining' is the input left to
     #! consume and 'ast' is the value the parser produced.
  12 TUPLE: parse-error position messages ;
     #! A recorded failure, built by add-error from a position
     #! and messages (parse-token passes f for both on success).
  13 TUPLE: parser peg compiled id ;
     #! Wraps a peg tuple: 'compiled' is f until preset-parser-word
     #! stores a word there; 'id' comes from next-id.
  14
  15 M: parser equal? { [ [ class-of ] same? ] [ [ id>> ] same? ] } 2&& ;
     #! Two parsers are equal when they have the same class and
     #! the same id; the wrapped peg is not compared.
  16 M: parser hashcode* id>> hashcode* ;
     #! Hash on the id only, matching the equal? method.
  17
  18 C: <parse-result> parse-result
  19 C: <parse-error>  parse-error
     #! Positional constructors:
     #!   <parse-result> ( remaining ast -- parse-result )
     #!   <parse-error>  ( position messages -- parse-error )
  20
  21 SYMBOL: error-stack
     #! Variable holding the vector of parse-error tuples recorded
     #! during a parse; with-packrat binds it to a fresh V{ }.
  22
  23 :: (merge-errors) ( a b -- c )
         #! Combine two parse-error tuples into one. An error with no
         #! position loses to the other one. Otherwise the error with
         #! the greater position wins, and on a tie a new error is
         #! built at that position with the union of both message sets.
  24     {
  25         { [ a position>> not ] [ b ] }
  26         { [ b position>> not ] [ a ] }
  27         [
  28             a b [ position>> ] compare {
  29                 { +lt+ [ b ] }
  30                 { +gt+ [ a ] }
  31                 { +eq+ [ a position>> b messages>> a messages>> union <parse-error> ] }
  32             } case
  33         ]
  34     } cond ;
  35
  36 :: merge-errors ( -- )
         #! Replace the two most recent entries of error-stack with
         #! their merge; do nothing when fewer than two are recorded.
  37     error-stack get :> errs
  38     errs length 1 > [
         ! The first pop is the newer error; swap so the older one
         ! is the first argument of (merge-errors).
  39         errs pop errs pop swap
  40         (merge-errors) errs push
  41     ] when ;
  42
  43 : add-error ( remaining message -- )
         #! Record a parse-error built from the two inputs on top of
         #! error-stack.
  44     error-stack get [ <parse-error> ] dip push ;
  45
  46 SYMBOL: ignore
     #! Marker AST value: calc-seq-result leaves a result whose ast
     #! is 'ignore' out of the sequence's AST. 'hide', 'ensure' and
     #! 'ensure-not' produce it.
  47
  48 : packrat ( id -- cache )
  49     #! The packrat cache is a mapping of parser-id->cache.
  50     #! For each parser id it maps to a cache holding a mapping
  51     #! of position->memo-entry. The packrat cache therefore keeps
  52     #! track of all parses that have occurred at each position
  53     #! of the input string and the results obtained from that
  54     #! parser.
         #! The per-id table is created empty the first time it is
         #! asked for.
  55     \ packrat get [ drop H{ } clone ] cache ;
  56
  57 SYMBOL: pos
     #! Current parse position; with-packrat starts it at 0.
  58 SYMBOL: input
     #! The input being parsed, as bound by with-packrat.
  59 SYMBOL: fail
     #! Sentinel answer meaning a rule did not match (see failed?).
  60 SYMBOL: lrstack
     #! Most recent left-recursion tuple of the rules currently being
     #! evaluated, chained through 'next'; f when there is none.
  61
  62 : heads ( -- cache )
  63     #! A mapping from position->peg-head. It maps a
  64     #! position in the input string being parsed to
  65     #! the head of the left recursion which is currently
  66     #! being grown. It is 'f' at any position where
  67     #! left recursion growth is not underway.
         #! grow-lr adds the entry before growing and deletes it
         #! afterwards.
  68     \ heads get ;
  69
  70 : failed? ( obj -- ? )
         #! True when obj is the 'fail' symbol.
  71     fail = ;
  72
  73 : peg-cache ( -- cache )
  74     #! The global table mapping a peg tuple to the parser
  75     #! tuple that wraps it (unique id plus compiled word).
  76     #! It is created on demand if the global is unset.
  77     \ peg-cache [ H{ } clone ] initialize
  78     \ peg-cache get-global
  79     ;
  80
  81 : reset-pegs ( -- )
         #! Replace the global peg-cache with a fresh, empty table.
  82     H{ } clone \ peg-cache set-global ;
  83
  84 reset-pegs
     #! Run when the file is loaded, so the cache starts out empty.
  85
  86 #! An entry in the table of memoized parse results
  87 #! ans = an AST produced from the parse
  88 #!       or the symbol 'fail'
  89 #!       or a left-recursion object
  90 #! pos = the input position recorded for that answer; it is
     #!       restored into 'pos' when the entry is reused
  91 TUPLE: memo-entry ans pos ;
  92
  93 TUPLE: left-recursion seed rule-id head next ;
     #! Placeholder answer for a rule that is still being evaluated:
     #! 'seed' starts as 'fail', 'head' is f until setup-lr assigns a
     #! peg-head, and 'next' is the lrstack entry that was current
     #! when this one was pushed.
  94 TUPLE: peg-head rule-id involved-set eval-set ;
     #! Head of a left recursion: the id of the head rule, the ids
     #! of rules added by (setup-lr), and the copy of that set that
     #! setup-growth resets before each growth pass.
  95
  96 : rule-id ( word -- id )
  97     #! A rule is the parser compiled down to a word. It has
  98     #! a "peg-id" property containing the id of the original parser.
         #! define-parser-word is what sets that property.
  99     "peg-id" word-prop ;
 100
 101 : input-slice ( -- slice )
 102     #! Return a slice of the input from the current parse position
         #! ('pos') to the end.
 103     input get pos get tail-slice ;
 104
 105 :: input-from ( input -- n )
 106     #! Index at which 'input' begins: the 'from' of a slice,
 107     #! or 0 for anything that is not a slice.
 108     input slice? [ input from>> ] [ 0 ] if ;
 109
 110 :: process-rule-result ( p result -- result )
         #! Turn the raw outcome of a rule into an answer. On success
         #! move pos to where the remaining input starts and return
         #! the ast; on failure (result is f) rewind pos to p and
         #! return 'fail'.
 111     result [
 112         result remaining>> input-from pos set result ast>>
 113     ] [
 114         p pos set fail
 115     ] if ;
 116
 117 : eval-rule ( rule -- ast )
 118     #! Run the compiled rule word, whose stack effect is
 119     #! ( -- parse-result ), and return its ast or 'fail'.
 120     #! pos is read before the rule runs so that a failure can
         #! rewind to it.
 121     [ pos get ] dip execute( -- parse-result ) process-rule-result ; inline
 122
 123 : memo ( pos id -- memo-entry )
 124     #! Return the result from the memo cache.
         #! Looks 'pos' up in the per-id table from 'packrat'; the
         #! result is f when nothing is stored for that position.
 125     packrat at ;
 126
 127 : set-memo ( memo-entry pos id -- )
 128     #! Store an entry in the cache
         #! under 'pos' in the per-id table from 'packrat'.
 129     packrat set-at ;
 130
 131 : update-m ( ast m -- )
         #! Store ast as the answer of memo-entry m and stamp m with
         #! the current parse position.
 132     [ ans<< ] keep pos get >>pos drop ;
 133
 134 :: stop-growth? ( ast m -- ? )
         #! Seed growing ends when the latest evaluation failed or
         #! did not get past the position already recorded in m.
 135     ast failed?
 136     pos get m pos>> <= or ;
 137
 138 :: setup-growth ( h p -- )
         #! Prepare one growth pass: rewind pos to p and reset the
         #! head's eval-set to a copy of its involved-set.
 139     p pos set h involved-set>> clone h eval-set<< ;
 140
 141 : (grow-lr) ( h p r: ( -- result ) m -- )
         #! Seed-growing loop for head h at position p. Each pass
         #! runs setup-growth (rewind pos, reset h's eval-set) and
         #! re-evaluates rule r. It stops when stop-growth? says so;
         #! otherwise the new answer is stored in m and it recurses.
 142     [ [ setup-growth ] 2keep ] 2dip
         ! Stack after the next line: h p r m ast
 143     [ dup eval-rule ] dip swap
 144         dup pick stop-growth? [
 145         5 ndrop
 146     ] [
 147         over update-m
 148         (grow-lr)
 149     ] if ; inline recursive
 150
 151 : grow-lr ( h p r m -- ast )
         #! Grow the seed held in m for rule r at position p. h is
         #! registered in 'heads' under p while (grow-lr) runs and is
         #! removed afterwards; then pos is restored from m and m's
         #! answer is returned.
 152     [ [ heads set-at ] 2keep ] 2dip
 153     pick over [ (grow-lr) ] 2dip
 154     swap heads delete-at
 155     dup pos>> pos set ans>>
 156     ; inline
 157
 158 :: (setup-lr) ( l s -- )
         #! Walk the lrstack chain starting at s (following 'next')
         #! until an entry that already shares l's head, or the end
         #! of the chain. Each entry visited gets l's head and has
         #! its rule-id added to the head's involved-set.
         #! Throws s itself if s is not a left-recursion tuple.
 159     s [
 160         s left-recursion? [ s throw ] unless
 161         s head>> l head>> eq? [
 162             l head>> s head<<
 163             l head>> [ s rule-id>> suffix ] change-involved-set drop
 164             l s next>> (setup-lr)
 165         ] unless
 166     ] when ;
 167
 168 :: setup-lr ( r l -- )
         #! Make sure left-recursion l has a head (a new peg-head for
         #! rule r with empty involved and eval sets if it has none),
         #! then run (setup-lr) over the current lrstack.
 169     l head>> [
 170         r rule-id V{ } clone V{ } clone peg-head boa l head<<
 171     ] unless
 172     l lrstack get (setup-lr) ;
 173
 174 :: lr-answer ( r p m -- ast )
         #! m's answer is a left-recursion tuple; h is its head.
         #! If r is the head rule, replace m's answer with the seed
         #! and grow it with grow-lr, unless the seed is 'fail', in
         #! which case 'fail' is returned. For any other rule the
         #! seed is returned as it stands.
 175     m ans>> head>> :> h
 176     h rule-id>> r rule-id eq? [
 177         m ans>> seed>> m ans<<
 178         m ans>> failed? [
 179             fail
 180         ] [
 181             h p r m grow-lr
 182         ] if
 183     ] [
 184         m ans>> seed>>
 185     ] if ; inline
 186
 187 :: recall ( r p -- memo-entry )
         #! Fetch the memo entry m for rule r at position p, taking
         #! into account a left recursion being grown at p (head h).
         #! With no head, m is returned as is (possibly f).
 188     p r rule-id memo :> m
 189     p heads at :> h
 190     h [
         ! An entry exists and r is neither the head rule nor in the
         ! involved-set: answer with a fresh failing entry.
         ! NOTE(review): Warth et al.'s RECALL tests for a *missing*
         ! memo entry at this point; this tests for a present one.
         ! Confirm which is intended.
 191         m r rule-id h involved-set>> h rule-id>> suffix member? not and [
 192             fail p memo-entry boa
 193         ] [
         ! A rule still in the eval-set is re-evaluated once per
         ! growth pass: remove it, evaluate it, store the result in m.
 194             r rule-id h eval-set>> member? [
 195                 h [ r rule-id swap remove ] change-eval-set drop
 196                 r eval-rule
 197                 m update-m
 198                 m
 199             ] [
 200                 m
 201             ] if
 202         ] if
 203     ] [
 204         m
 205     ] if ; inline
 206
 207 :: apply-non-memo-rule ( r p -- ast )
         #! Evaluate rule r at p when recall found no memo entry.
         #! A left-recursion tuple lr (seed 'fail') is pushed on
         #! lrstack and memoized for (p, r) before r is evaluated, so
         #! a recursive application of r at p finds it. Afterwards
         #! lrstack is popped and the position is recorded in m.
         #! If lr has a head and m still holds a left-recursion, ans
         #! becomes the seed and lr-answer takes over; if it has a
         #! head but m holds something else, ans is returned without
         #! being stored. With no head, ans is stored in m and returned.
 208     fail r rule-id f lrstack get left-recursion boa :> lr
 209     lr lrstack set lr p memo-entry boa dup p r rule-id set-memo :> m
 210     r eval-rule :> ans
 211     lrstack get next>> lrstack set
 212     pos get m pos<<
 213     lr head>> [
 214         m ans>> left-recursion? [
 215             ans lr seed<<
 216             r p m lr-answer
 217         ] [ ans ] if
 218     ] [
 219         ans m ans<<
 220         ans
 221     ] if ; inline
 222
 223 :: apply-memo-rule ( r m -- ast )
         #! Reuse memo entry m for rule r: restore pos from m, then
         #! return m's answer. If that answer is a left-recursion
         #! tuple, run setup-lr on it and return its seed instead.
 224     m pos>> pos set m ans>> :> ans
 225     ans left-recursion? [
 226         r ans setup-lr ans seed>>
 227     ] [
 228         ans
 229     ] if ;
 230
 231 :: apply-rule ( r p -- ast )
         #! Apply rule r at position p: reuse the entry returned by
         #! recall when there is one, otherwise evaluate the rule
         #! with apply-non-memo-rule.
 232     r p recall [
 233         r swap apply-memo-rule
 234     ] [
 235         r p apply-non-memo-rule
 236     ] if* ; inline
 237
 238 : with-packrat ( input quot -- result )
 239     #! Run the quotation with a packrat cache active.
         #! Builds a namespace holding the input, pos 0, an empty
         #! lrstack, and fresh error-stack, heads and packrat tables,
         #! then calls quot with those variables bound.
 240     [
 241         swap input ,,
 242         0 pos ,,
 243         f lrstack ,,
 244         V{ } clone error-stack ,,
 245         H{ } clone \ heads ,,
 246         H{ } clone \ packrat ,,
 247     ] H{ } make swap with-variables ; inline
 248
 249 GENERIC: (compile) ( peg -- quot )
     #! Each peg tuple class implements this to produce the quotation
     #! that becomes the body of its compiled word; define-parser-word
     #! declares that word with effect ( -- result ).
 250
 251 :: process-parser-result ( result -- result )
         #! Convert the answer from apply-rule back into what parsers
         #! return: f for 'fail', otherwise a parse-result pairing
         #! the input from the current position with the answer.
 252     result failed? [
 253         f
 254     ] [
 255         input-slice result <parse-result>
 256     ] if ;
 257
 258 : execute-parser ( word -- result )
         #! Apply the compiled rule 'word' at the current position via
         #! apply-rule and return a parse-result, or f on failure.
 259     pos get apply-rule process-parser-result ;
 260
 261 :: preset-parser-word ( parser -- parser word )
         #! Create a fresh word with gensym and store it in the
         #! parser's 'compiled' slot; the word has no definition yet.
 262     gensym :> word parser word >>compiled word ;
 263
 264 : define-parser-word ( parser word -- )
 265     #! Define 'word' as the compiled form of the parser's peg,
 266     #! with effect ( -- result ), and record the parser's id in
         #! the word's "peg-id" property. Nothing is returned.
 267     2dup swap peg>> (compile) ( -- result ) define-declared
 268     swap id>> "peg-id" set-word-prop ;
 269
 270 : compile-parser ( parser -- word )
 271     #! Look to see if the given parser has been compiled.
 272     #! If not, compile it to a temporary word, cache it,
 273     #! and return it. Otherwise return the existing one.
 274     #! Circular parsers are supported by storing the new word
 275     #! in the parser's 'compiled' slot before its body is
 276     #! compiled, so it is picked up when re-entered.
 277     dup compiled>> [
 278         nip
 279     ] [
 280         preset-parser-word [ define-parser-word ] keep
 281     ] if* ;
 282
 283 : compile-parser-quot ( parser -- quot )
         #! Compile the parser and return a quotation that runs its
         #! word through execute-parser.
 284     compile-parser [ execute-parser ] curry ;
 285
 286 SYMBOL: delayed
     #! Bound by 'compile' to a table of word -> quotation for the
     #! delay parsers met while compiling; fixup-delayed consumes it.
 287
 288 : fixup-delayed ( -- )
 289     #! Work through all delayed parsers and recompile their
 290     #! words to have the correct bodies.
         #! For each word -> quotation entry: call the quotation to
         #! get a parser, compile it, and define the word as its body.
 291     delayed get [
 292         call( -- parser ) compile-parser-quot ( -- result ) define-declared
 293     ] assoc-each ;
 294
 295 : compile ( parser -- word )
         #! Compile a parser into a temporary word with effect
         #! ( -- result ). This runs inside one compilation unit with
         #! a fresh 'delayed' table, and delayed parsers are fixed up
         #! after the main word is defined.
 296     [
 297         H{ } clone delayed [
 298             compile-parser-quot ( -- result ) define-temp fixup-delayed
 299         ] with-variable
 300     ] with-compilation-unit ;
 301
 302 : compiled-parse ( state word -- result )
         #! Run the compiled parser 'word' on the input 'state' inside
         #! a fresh packrat context and return its parse-result.
         #! On failure the first recorded parse-error is thrown.
         #! Some parsers (parse-satisfy, parse-range) fail without
         #! calling add-error, so error-stack can be empty here. In
         #! that case a parse-error at the current position with no
         #! messages is thrown, instead of the bounds error that
         #! 'first' would raise on an empty vector.
 303     swap [
             execute( -- result )
             [
                 error-stack get
                 [ pos get V{ } clone <parse-error> ] [ first ] if-empty
                 throw
             ] unless*
         ] with-packrat ;
 304
 305 : (parse) ( input parser -- result )
         #! Parse 'input' and return the whole parse-result. 'parser'
         #! may be an already compiled word; anything else is
         #! compiled first.
 306     dup word? [ compile ] unless compiled-parse ;
 307
 308 : parse ( input parser -- ast )
         #! Parse 'input' and return only the ast of the result.
 309     (parse) ast>> ;
 310
 311 <PRIVATE
 312
 313 SYMBOL: id
     #! Global counter used by next-id; unset until the first call.
 314
 315 : next-id ( -- n )
 316     #! Return the next unique id for a parser. Ids start
 317     #! at 0. The global value of 'id' is the id to hand out
 318     #! next, or f before the first call.
 319     id get-global 0 or
 320     dup 1 +
 321     id set-global ;
 322
 323 : wrap-peg ( peg -- parser )
 324     #! Wrap a parser tuple around the peg object.
 325     #! Look for an existing parser tuple for that
 326     #! peg object in peg-cache; otherwise create one with
         #! no compiled word and a fresh id, and cache it.
 327     peg-cache [
 328         f next-id parser boa
 329     ] cache ;
 330
 331 TUPLE: token-parser symbol ;
 332
 333 :: parse-token ( input string -- result )
 334     #! Match 'string' at the front of 'input'. On success return
         #! a parse-result whose ast is the string, and record an
         #! error with f position and f messages. On failure record
         #! "token '<string>'" at the current pos and return f.
 335     input string ?head-slice [
 336         string <parse-result> f f add-error
 337     ] [
 338         drop pos get "token '" string append "'" append 1vector add-error f
 339     ] if ;
 340
 341 M: token-parser (compile) ( peg -- quot )
         #! The compiled body runs parse-token on the input from the
         #! current position.
 342     symbol>> '[ input-slice _ parse-token ] ;
 343
 344 TUPLE: satisfy-parser quot ;
 345
 346 : parse-satisfy ( input quot -- result )
         #! Consume one element of 'input' if 'quot' answers true for
         #! it; the ast is that element. Return f on empty input or
         #! when the predicate rejects the element.
 347     over empty? [
 348         2drop f
 349     ] [
 350         [ unclip-slice dup ] dip call [
 351             <parse-result>
 352         ] [
 353             2drop f
 354         ] if
 355     ] if ; inline
 356
 357
 358 M: satisfy-parser (compile) ( peg -- quot )
         #! The compiled body runs parse-satisfy on the input from the
         #! current position.
 359     quot>> '[ input-slice _ parse-satisfy ] ;
 360
 361 TUPLE: range-parser min max ;
 362
 363 :: parse-range ( input min max -- result )
         #! Consume one element of 'input' if it lies between min and
         #! max (as tested by between?); the ast is that element.
         #! Return f on empty input or when it is out of range.
 364     input empty? [
 365         f
 366     ] [
 367         input first min max between? [
 368             input unclip-slice <parse-result>
 369         ] [
 370             f
 371         ] if
 372     ] if ;
 373
 374 M: range-parser (compile) ( peg -- quot )
         #! The compiled body runs parse-range on the input from the
         #! current position.
 375     [ min>> ] [ max>> ] bi '[ input-slice _ _ parse-range ] ;
 376
 377 TUPLE: seq-parser parsers ;
     #! Peg for a sequence: 'parsers' holds the parsers to run in
     #! order.
 378
 379 : ignore? ( ast -- bool )
         #! True when ast is the 'ignore' symbol.
 380     ignore = ;
 381
 382 :: calc-seq-result ( prev-result current-result -- next-result )
 383     #! Fold the result of one sequence element into the
 384     #! accumulated result. If the element failed
 385     #! (current-result is f) the answer is f. Otherwise
 386     #! prev-result takes over the element's remaining input and,
 387     #! unless the element's ast is 'ignore', that ast is pushed
         #! onto prev-result's ast vector.
 388     current-result [
 389         current-result remaining>> prev-result remaining<<
 390         current-result ast>> :> ast
 391         ast ignore? [ ast prev-result ast>> push ] unless
 392         prev-result
         ] [ f ] if ;
 393
 394 : parse-seq-element ( result quot -- result )
         #! Run one element parser and fold its result into 'result'
         #! with calc-seq-result. If 'result' is already f the
         #! quotation is not called and f is returned.
 395     swap [
 396         swap call calc-seq-result
 397     ] [
 398         drop f
 399     ] if* ; inline
 400
 401 M: seq-parser (compile) ( peg -- quot )
         #! Build a quotation that starts from an empty result (input
         #! at the current position, empty ast vector) and threads it
         #! through one parse-seq-element step per parser using 1&&.
         #! Every step after the first also calls merge-errors.
         #! 'unclip' is applied to parsers>>, so the sequence is
         #! expected to be non-empty.
 402     [
 403         [ input-slice V{ } clone <parse-result> ] %
 404         [
 405             parsers>> unclip compile-parser-quot [ parse-seq-element ] curry ,
 406             [ compile-parser-quot [ merge-errors ] compose [ parse-seq-element ] curry , ] each
 407         ] { } make , \ 1&& ,
 408     ] [ ] make ;
 409
 410 TUPLE: choice-parser parsers ;
     #! Peg for an ordered choice: 'parsers' holds the alternatives.
 411
 412 M: choice-parser (compile) ( peg -- quot )
         #! Build a quotation that tries each alternative in order
         #! with 0||. Every alternative after the first also calls
         #! merge-errors. 'unclip' is applied to the compiled
         #! alternatives, so parsers>> is expected to be non-empty.
 413     [
 414         [
 415             parsers>> [ compile-parser-quot ] map
 416             unclip , [ [ merge-errors ] compose , ] each
 417         ] { } make , \ 0|| ,
 418     ] [ ] make ;
 419
 420 TUPLE: repeat0-parser p1 ;
     #! Peg for zero or more repetitions of the parser in 'p1'.
 421
 422 : (repeat) ( quot: ( -- result ) result -- result )
         #! Call quot repeatedly. Each successful result hands its
         #! remaining input to the accumulated result and has its ast
         #! pushed onto the accumulated ast vector. The first f from
         #! quot ends the loop and the accumulated result is returned.
 423     over call [
 424         [ remaining>> swap remaining<< ] 2keep
 425         ast>> swap [ ast>> push ] keep
 426         (repeat)
 427     ] [
 428         nip
 429     ] if* ; inline recursive
 430
 431 M: repeat0-parser (compile) ( peg -- quot )
         #! Start from an empty result at the current position and
         #! repeat the compiled p1 with (repeat).
 432     p1>> compile-parser-quot '[
 433         input-slice V{ } clone <parse-result> _ swap (repeat)
 434     ] ;
 435
 436 TUPLE: repeat1-parser p1 ;
 437
 438 : repeat1-empty-check ( result -- result )
 439     #! Turn a result whose ast is empty into f, since one or
 440     #! more repetitions require at least one match; f stays f.
 441     dup [
 442         dup ast>> empty? [ drop f ] when
 443     ] when ;
 444
 445 M: repeat1-parser (compile) ( peg -- quot )
         #! Same loop as repeat0, followed by repeat1-empty-check.
 446     p1>> compile-parser-quot '[
 447         input-slice V{ } clone <parse-result> _ swap (repeat) repeat1-empty-check
 448     ] ;
 449
 450 TUPLE: optional-parser p1 ;
     #! Peg for an optional match of the parser in 'p1'.
 451
 452 : check-optional ( result -- result )
           #! Replace a failed result (f) with a successful one at the
           #! current position whose ast is f.
 453       [ input-slice f <parse-result> ] unless* ;
 454
 455 M: optional-parser (compile) ( peg -- quot )
           #! Run the compiled p1, then check-optional.
 456       p1>> compile-parser-quot '[ @ check-optional ] ;
 457
 458 TUPLE: semantic-parser p1 quot ;
     #! Peg that accepts p1's result only if 'quot' approves its ast.
 459
 460 : check-semantic ( result quot -- result )
         #! If result is not f, call quot on its ast; a false answer
         #! turns the result into f. An f result is passed through.
 461     over [
 462         over ast>> swap call [ drop f ] unless
 463     ] [
 464         drop
 465     ] if ; inline
 466
 467 M: semantic-parser (compile) ( peg -- quot )
         #! Run the compiled p1, then check-semantic with quot.
 468     [ p1>> compile-parser-quot ] [ quot>> ] bi
 469     '[ @ _ check-semantic ] ;
 470
 471 TUPLE: ensure-parser p1 ;
     #! Peg for a positive lookahead on the parser in 'p1'.
 472
 473 : check-ensure ( old-input result -- result )
         #! If the lookahead succeeded, return a result whose
         #! remaining input is old-input and whose ast is 'ignore';
         #! otherwise return f.
 474     [ ignore <parse-result> ] [ drop f ] if ;
 475
 476 M: ensure-parser (compile) ( peg -- quot )
         #! Capture the input at the current position, run the
         #! compiled p1, then check-ensure.
 477     p1>> compile-parser-quot '[ input-slice @ check-ensure ] ;
 478
 479 TUPLE: ensure-not-parser p1 ;
     #! Peg for a negative lookahead on the parser in 'p1'.
 480
 481 : check-ensure-not ( old-input result -- result )
         #! If the lookahead succeeded, return f; otherwise return a
         #! result whose remaining input is old-input and whose ast
         #! is 'ignore'.
 482     [ drop f ] [ ignore <parse-result> ] if ;
 483
 484 M: ensure-not-parser (compile) ( peg -- quot )
         #! Capture the input at the current position, run the
         #! compiled p1, then check-ensure-not.
 485     p1>> compile-parser-quot '[ input-slice @ check-ensure-not ] ;
 486
 487 TUPLE: action-parser p1 quot ;
 488
 489 :: check-action ( result quot -- result )
         #! On success, replace the result's ast with what quot
         #! ( ast -- ast ) returns for it. An f result stays f.
 490     result [
 491         result dup ast>> quot call( ast -- ast ) >>ast
 492     ] [
 493         f
 494     ] if ;
 495
 496 M: action-parser (compile) ( peg -- quot )
         #! Run the compiled p1, then check-action with quot.
 497     [ p1>> compile-parser-quot ] [ quot>> ] bi '[ @ _ check-action ] ;
 498
 499 TUPLE: sp-parser p1 ;
     #! Peg that skips leading blanks before running 'p1'.
 500
 501 M: sp-parser (compile) ( peg -- quot )
         #! Trim elements satisfying blank? from the head of the
         #! input at the current position, move pos to where the
         #! trimmed slice starts, then run the compiled p1.
 502     p1>> compile-parser-quot '[
 503         input-slice [ blank? ] trim-head-slice input-from pos set @
 504     ] ;
 505
 506 TUPLE: delay-parser quot ;
     #! Peg whose real parser is produced later by calling 'quot'.
 507
 508 M: delay-parser (compile) ( peg -- quot )
 509     #! Create a fresh word, register word -> quot in the
 510     #! 'delayed' table, and compile to a call of that word.
 511     #! fixup-delayed later calls quot once and gives the word
         #! its real body.
 512     quot>> gensym [ delayed get set-at ] keep 1quotation ;
 513
 514 TUPLE: box-parser quot ;
     #! Peg whose parser is produced by calling 'quot' at compile
     #! time.
 515
 516 M: box-parser (compile) ( peg -- quot )
 517     #! Calls the quotation at compile time
 518     #! to produce the parser to be compiled.
 519     #! This differs from 'delay', whose quotation is
 520     #! called later, by fixup-delayed.
 521     quot>> call( -- parser ) compile-parser-quot ;
 522
 523 PRIVATE>
 524
 525 : token ( string -- parser )
         #! Parser that matches 'string' at the front of the input.
 526     token-parser boa wrap-peg ;
 527
 528 : satisfy ( quot -- parser )
         #! Parser that consumes one element for which quot is true.
 529     satisfy-parser boa wrap-peg ;
 530
 531 : range ( min max -- parser )
         #! Parser that consumes one element lying between min and max.
 532     range-parser boa wrap-peg ;
 533
 534 : seq ( seq -- parser )
         #! Parser that runs every parser in 'seq' in order and
         #! collects their asts (except 'ignore') in a vector.
 535     seq-parser boa wrap-peg ;
 536
 537 : 2seq ( parser1 parser2 -- parser )
         #! 'seq' for exactly two parsers.
 538     2array seq ;
 539
 540 : 3seq ( parser1 parser2 parser3 -- parser )
         #! 'seq' for exactly three parsers.
 541     3array seq ;
 542
 543 : 4seq ( parser1 parser2 parser3 parser4 -- parser )
         #! 'seq' for exactly four parsers.
 544     4array seq ;
 545
 546 : seq* ( quot -- parser )
         #! Build a 'seq' parser from the array that 'quot' builds
         #! inside a { } make.
 547     { } make seq ; inline
 548
 549 : choice ( seq -- parser )
         #! Parser that tries the parsers in 'seq' in order.
 550     choice-parser boa wrap-peg ;
 551
 552 : 2choice ( parser1 parser2 -- parser )
         #! 'choice' for exactly two parsers.
 553     2array choice ;
 554
 555 : 3choice ( parser1 parser2 parser3 -- parser )
         #! 'choice' for exactly three parsers.
 556     3array choice ;
 557
 558 : 4choice ( parser1 parser2 parser3 parser4 -- parser )
         #! 'choice' for exactly four parsers.
 559     4array choice ;
 560
 561 : choice* ( quot -- parser )
         #! Build a 'choice' parser from the array that 'quot' builds
         #! inside a { } make.
 562     { } make choice ; inline
 563
 564 : repeat0 ( parser -- parser )
         #! Parser that matches 'parser' zero or more times.
 565     repeat0-parser boa wrap-peg ;
 566
 567 : repeat1 ( parser -- parser )
         #! Parser that matches 'parser' one or more times.
 568     repeat1-parser boa wrap-peg ;
 569
 570 : optional ( parser -- parser )
         #! Parser that matches 'parser' if it can and otherwise
         #! succeeds with an ast of f.
 571     optional-parser boa wrap-peg ;
 572
 573 : semantic ( parser quot -- parser )
         #! Parser that succeeds only if quot is true for the ast.
 574     semantic-parser boa wrap-peg ;
 575
 576 : ensure ( parser -- parser )
         #! Positive lookahead; its ast is 'ignore'.
 577     ensure-parser boa wrap-peg ;
 578
 579 : ensure-not ( parser -- parser )
         #! Negative lookahead; its ast is 'ignore'.
 580     ensure-not-parser boa wrap-peg ;
 581
 582 : action ( parser quot -- parser )
         #! Parser whose ast is quot ( ast -- ast ) applied to the
         #! ast of 'parser'.
 583     action-parser boa wrap-peg ;
 584
 585 : sp ( parser -- parser )
         #! Parser that skips leading blanks, then runs 'parser'.
 586     sp-parser boa wrap-peg ;
 587
 588 : hide ( parser -- parser )
         #! Parser whose ast is replaced by 'ignore', so that a
         #! surrounding seq leaves it out of its result.
 589     [ drop ignore ] action ;
 590
 591 : delay ( quot -- parser )
         #! Parser whose real parser is obtained by calling quot
         #! ( -- parser ) when delayed parsers are fixed up.
 592     delay-parser boa wrap-peg ;
 593
 594 : box ( quot -- parser )
 595     #! Because a box has its quotation run at compile time
 596     #! it must always have a new parser wrapper created,
 597     #! not a cached one. This is because the same box,
 598     #! compiled twice, can have a different compiled word
 599     #! due to running at compile time.
 600     #! Why the [ ] action at the end? Box parsers don't get
 601     #! memoized during parsing due to all box parsers being
 602     #! unique. This breaks left recursion detection during the
 603     #! parse. The action adds an indirection with a parser type
 604     #! that gets memoized and fixes this. Need to rethink how
 605     #! to fix boxes so this isn't needed...
 606     box-parser boa f next-id parser boa [ ] action ;
 607
 608 ERROR: parse-failed input word ;
     #! Error raised by words defined with PEG: when the parse gives
     #! no result; carries the input and the defined word.
 609
 610 SYNTAX: PEG:
         #! Parsing word that defines a word backed by a peg parser.
         #! It reads a word, definition and effect with (:) and adds
         #! a quotation to the accumulator. That quotation calls the
         #! definition to get a parser, compiles it, and defines the
         #! word so that it runs the compiled parser on its input and
         #! returns the ast, or calls parse-failed with the input and
         #! the word when there is no result.
 611     [let
 612         (:) :> ( word def effect )
 613         [
 614           [
 615             def call compile :> compiled-def
 616             [
 617               dup compiled-def compiled-parse
 618               [ ast>> ] [ word parse-failed ] ?if
 619             ]
 620             word swap effect define-declared
 621           ] with-compilation-unit
 622         ] append!
 623     ] ;
 624
 625 USE: vocabs.loader
 626
 627 { "debugger" "peg" } "peg.debugger" require-when