basis/peg/peg.factor

   1 ! Copyright (C) 2007, 2008 Chris Double.
   2 ! See https://factorcode.org/license.txt for BSD license.
   3
   4 USING: accessors arrays assocs classes combinators
   5 combinators.short-circuit compiler.units effects.parser kernel
   6 literals make math math.order memoize namespaces quotations
   7 sequences sets splitting strings unicode vectors vocabs.loader
   8 words ;
   9
  10 IN: peg
  11
  12 TUPLE: parse-result remaining ast ;
     ! parse-result: a successful match -- the input left over after
     ! the match and the AST the match produced.
  13 TUPLE: parse-error position got messages ;
     ! parse-error: one recorded failure; add-error builds these and
     ! pushes them onto error-stack.
  14 TUPLE: parser peg compiled id ;
     ! parser: wraps a peg object together with its compiled word
     ! (f until compiled) and a unique id; see wrap-peg.
  15
  16 M: parser equal? 2dup [ class-of ] same? [ [ id>> ] same? ] [ 2drop f ] if ;
     ! Two parsers are equal when they have the same class and the
     ! same id; ids are only compared once the classes match.
  17 M: parser hashcode* id>> hashcode* ;
     ! The hash is derived from the id alone.
  18
  19 C: <parse-result> parse-result
  20 C: <parse-error>  parse-error
     ! Constructors for the two result tuples; callers in this file pass
     ! ( remaining ast ) and ( position got messages ) respectively.
  21
  22 GENERIC: parser-quot ( peg -- quot )
     ! Each peg tuple class implements parser-quot to produce the
     ! quotation that define-parser-word installs as the body of the
     ! peg's compiled word.
  23
  24 SYMBOL: ignore
     ! AST value that calc-seq-result leaves out of a sequence's AST.
  25 SYMBOL: fail
     ! Value used in place of an AST when a rule evaluation failed.
  26
  27 <PRIVATE
  28
  29 SYMBOL: error-stack
     ! Variable holding the vector of parse-error tuples recorded by
     ! add-error; with-packrat binds it to a fresh V{ } for each parse.
  30
  31 :: merge-overlapping-errors ( a b -- c )
         ! Combine two errors: the result keeps a's position and got,
         ! and its messages are the union of both message sets.
  32     a position>> a got>> a messages>> b messages>> union
  33     <parse-error> ;
  34
  35 : (merge-errors) ( a b -- c )
  36     ! If a has no position the result is b; if b has none it is a.
  37     ! Otherwise the error further into the input wins; ties merge.
  38     over position>> [
  39         dup position>> [
  40             2dup [ position>> ] compare {
  41                 { +lt+ [ nip ] }
  42                 { +gt+ [ drop ] }
  43                 { +eq+ [ merge-overlapping-errors ] }
  44             } case
  45         ] [ drop ] if
  46     ] [ nip ] if ;
  47
  48 :: merge-errors ( -- )
         ! Replace the two most recent entries of error-stack with their
         ! merge. Does nothing when fewer than two errors are recorded.
  49     error-stack get :> errors
  50     errors length 1 > [
  51         errors pop :> newer
  52         errors pop newer (merge-errors) errors push
  53     ] when ;
  54
  55 : add-error ( position got message -- )
         ! Build a parse-error from the three values and push it onto
         ! the current error-stack. parse-token passes a one-element
         ! vector as the message.
  56     <parse-error> error-stack get push ;
  57
  58 : packrat ( id -- cache )
  59     ! The packrat cache is a mapping of parser-id->cache.
  60     ! For each parser it maps to a cache holding a mapping
  61     ! of position->result. The packrat cache therefore keeps
  62     ! track of all parses that have occurred at each position
  63     ! of the input string and the results obtained from that
  64     ! parser.
         ! Returns the position->result table for the given id,
         ! creating an empty one the first time that id is looked up.
  65     \ packrat get [ drop H{ } clone ] cache ;
  66
  67 SYMBOL: pos
     ! Current parse position: input-slice takes the tail of the input
     ! starting at this index.
  68 SYMBOL: input
     ! The whole sequence being parsed; bound by with-packrat.
  69 SYMBOL: lrstack
     ! Most recent left-recursion record for the rules currently being
     ! evaluated (older ones are reached through next>>), or f.
  70
  71 : heads ( -- cache )
  72     ! A mapping from position->peg-head. It maps a
  73     ! position in the input string being parsed to
  74     ! the head of the left recursion which is currently
  75     ! being grown. It is 'f' at any position where
  76     ! left recursion growth is not underway.
         ! with-packrat binds a fresh, empty table for each parse.
  77     \ heads get ;
  78
  79 : peg-cache ( -- cache )
  80     ! Global hashtable mapping each peg tuple to the parser
  81     ! tuple that wraps it. The parser tuple carries a unique
  82     ! id and the compiled form of that peg. The table is
  83     ! created on first use if the global is still unset.
  84     \ peg-cache [ H{ } clone ] initialize
  85     \ peg-cache get-global ;
  86
  87 : reset-pegs ( -- )
         ! Discard every cached peg->parser mapping by installing a
         ! fresh, empty table as the global peg-cache.
  88     H{ } clone \ peg-cache set-global ;
  89
     ! Top-level call: the cache is reset when this file is loaded.
  90 reset-pegs
  91
  92 : next-id ( -- n )
  93     ! Return the next unique id for a parser.
         ! The word next-id itself is the variable handed to counter.
  94     \ next-id counter ;
  95
  96 : wrap-peg ( peg -- parser )
  97     ! Wrap a parser tuple around the peg object.
  98     ! If peg-cache already holds a parser tuple for that peg
  99     ! it is reused; otherwise a new one is created with no
         ! compiled word and a fresh id, and stored in the cache.
 100     peg-cache [ f next-id parser boa ] cache ;
 101
 102 ! An entry in the table of memoized parse results
 103 ! ans = an AST produced from the parse
 104 !       or the symbol 'fail'
 105 !       or a left-recursion object
 106 ! pos = the parse position recorded with this answer;
     !       apply-memo-rule restores 'pos' from it on a cache hit
 107 TUPLE: memo-entry ans pos ;
 108
 109 TUPLE: left-recursion seed rule-id head next ;
     ! left-recursion: record pushed on lrstack while a rule is being
     ! evaluated. seed starts as fail, head is f until setup-lr assigns
     ! a peg-head, next is the previous lrstack value.
 110
 111 TUPLE: peg-head rule-id involved-set eval-set ;
     ! peg-head: head of a left recursion; setup-lr creates it with the
     ! head rule's id and two empty vectors.
 112
 113 : rule-id ( word -- id )
 114     ! A rule is the parser compiled down to a word. It has
 115     ! a "peg-id" property containing the id of the original parser
         ! (stored by define-parser-word).
 116     "peg-id" word-prop ;
 117
 118 : input-slice ( -- slice )
 119     ! Return a slice of the input running from the current
         ! parse position (pos) to the end of the input.
 120     input get pos get tail-slice ;
 121
 122 :: input-from ( input -- n )
 123     ! Index in the original string at which the given input
 124     ! slice starts. Anything that is not a slice yields 0.
 125     input slice? [ input from>> ] [ 0 ] if ;
 126
 127 :: process-rule-result ( p result -- result )
         ! On success, move pos to where the remaining input starts and
         ! return the ast. On failure, restore pos to p and return fail.
 128     result [
 129         result remaining>> input-from pos namespaces:set result ast>>
 130     ] [
 131         p pos namespaces:set fail
 132     ] if ;
 133
 134 : eval-rule ( rule -- ast )
 135     ! Evaluate a rule, return an ast resulting from it.
 136     ! Return fail if the rule failed. The rule has
 137     ! stack effect ( -- parse-result )
         ! The position before the call is passed along so that
         ! process-rule-result can restore it on failure.
 138     pos get swap execute( -- parse-result ) process-rule-result ; inline
 139
 140 : memo ( pos id -- memo-entry )
 141     ! Return the result from the memo cache: looks up pos in
         ! the position->result table that packrat keeps for id.
 142     packrat at ;
 143
 144 : set-memo ( memo-entry pos id -- )
 145     ! Store an entry in the cache: memo-entry is saved under
         ! pos in the position->result table that packrat keeps for id.
 146     packrat set-at ;
 147
 148 :: update-m ( ast m -- )
         ! Record ast as m's answer and the current pos as m's position.
 149     ast m ans<< pos get m pos<< ;
 150
 151 :: stop-growth? ( ast m -- ? )
         ! True when the latest evaluation failed, or when the current
         ! pos is not beyond the position already recorded in m.
 152     ast fail = pos get m pos>> <= or ;
 153
 154 :: setup-growth ( h p -- )
         ! Rewind pos to p and give head h a fresh eval-set copied from
         ! its involved-set.
 155     p pos namespaces:set h involved-set>> clone h eval-set<< ;
 156
 157 : (grow-lr) ( h p r: ( -- result ) m -- )
         ! Grow the seed parse stored in memo entry m: repeatedly
         ! re-evaluate rule r from position p, saving each answer in m,
         ! until stop-growth? reports a failure or no further progress.
         ! Each pass first calls setup-growth (keeping h and p on the
         ! stack), then evaluates r, leaving ( h p r m ast ).
 158     [ [ setup-growth ] 2keep ] 2dip
 159     [ dup eval-rule ] dip swap
 160         dup pick stop-growth? [
             ! done: discard h p r m and the last ast
 161         5drop
 162     ] [
             ! progress was made: store the ast and pos in m, go again
 163         over update-m
 164         (grow-lr)
 165     ] if ; inline recursive
 166
 167 : grow-lr ( h p r m -- ast )
         ! Register h in heads under position p, grow the seed with
         ! (grow-lr), then remove the heads entry for p again. Finally
         ! restore pos from m and return m's answer.
 168     [ [ heads set-at ] 2keep ] 2dip
 169     pick over [ (grow-lr) ] 2dip
 170     swap heads delete-at
 171     dup pos>> pos namespaces:set ans>>
 172     ; inline
 173
 174 :: (setup-lr) ( l s -- )
         ! Walk the chain of left-recursion records starting at s
         ! (following next>>) until a record that already shares l's
         ! head, or the end of the chain (f), is reached. Each record
         ! visited is given l's head, and its rule-id is appended to
         ! that head's involved-set.
         ! Throws s itself if s is neither f nor a left-recursion.
 175     s [
 176         s left-recursion? [ s throw ] unless
 177         s head>> l head>> eq? [
 178             l head>> s head<<
 179             l head>> [ s rule-id>> suffix ] change-involved-set drop
 180             l s next>> (setup-lr)
 181         ] unless
 182     ] when ;
 183
 184 :: setup-lr ( r l -- )
         ! Make sure left-recursion record l has a head (creating a
         ! peg-head for rule r with empty involved and eval sets if it
         ! has none), then propagate that head along lrstack.
 185     l head>> [
 186         r rule-id V{ } clone V{ } clone peg-head boa l head<<
 187     ] unless
 188     l lrstack get (setup-lr) ;
 189
 190 :: lr-answer ( r p m -- ast )
         ! m's answer is a left-recursion record; h is its head.
         ! If r is the head rule, replace m's answer with the record's
         ! seed and, unless that seed is fail, grow it with grow-lr.
         ! If r is not the head rule, just return the seed.
 191     m ans>> head>> :> h
 192     h rule-id>> r rule-id eq? [
 193         m ans>> seed>> m ans<<
 194         m ans>> fail = [
 195             fail
 196         ] [
 197             h p r m grow-lr
 198         ] if
 199     ] [
 200         m ans>> seed>>
 201     ] if ; inline
 202
 203 :: recall ( r p -- memo-entry )
         ! Look up the memo entry m for rule r at position p, taking
         ! any left recursion being grown at p (head h) into account.
         ! With no head at p, m is returned unchanged (possibly f).
 204     p r rule-id memo :> m
 205     p heads at :> h
 206     h [
             ! Guard: m is present and r is neither the head rule nor
             ! in the involved-set -> answer with a fail entry at p.
             ! NOTE(review): the published left-recursion algorithm
             ! (Warth et al.) applies this guard when the memo entry
             ! is *missing*; here it fires when m is present. Confirm
             ! which is intended before changing.
 207         m r rule-id h involved-set>> h rule-id>> suffix member? not and [
 208             fail p memo-entry boa
 209         ] [
                 ! A rule still in the eval-set is removed from it,
                 ! re-evaluated, and its result stored in m.
 210             r rule-id h eval-set>> member? [
 211                 h [ r rule-id swap remove ] change-eval-set drop
 212                 r eval-rule
 213                 m update-m
 214                 m
 215             ] [
 216                 m
 217             ] if
 218         ] if
 219     ] [
 220         m
 221     ] if ; inline
 222
 223 :: apply-non-memo-rule ( r p -- ast )
         ! Evaluate rule r at position p when recall found nothing.
         ! A left-recursion record lr (seed = fail) is pushed on lrstack
         ! and stored as the memo answer for (p, r) before evaluating,
         ! so a recursive application of r at p hits the memo table.
 224     fail r rule-id f lrstack get left-recursion boa :> lr
 225     lr lrstack namespaces:set lr p memo-entry boa dup p r rule-id set-memo :> m
 226     r eval-rule :> ans
         ! pop lr off lrstack and record where the evaluation ended
 227     lrstack get next>> lrstack namespaces:set
 228     pos get m pos<<
         ! lr has a head only if setup-lr ran during the evaluation
 229     lr head>> [
 230         m ans>> left-recursion? [
 231             ans lr seed<<
 232             r p m lr-answer
 233         ] [ ans ] if
 234     ] [
             ! no left recursion involved: memoize the plain answer
 235         ans m ans<<
 236         ans
 237     ] if ; inline
 238
 239 : apply-memo-rule ( r m -- ast )
         ! Use memo entry m for rule r: restore pos from the entry and
         ! return its answer. If the answer is a left-recursion record,
         ! run setup-lr for it and return its seed instead.
 240     [ ans>> ] [ pos>> ] bi pos namespaces:set
 241     dup left-recursion? [
 242         [ setup-lr ] keep seed>>
 243     ] [
 244         nip
 245     ] if ;
 246
 247 : apply-rule ( r p -- ast )
         ! Apply rule r at position p: use the memo entry that recall
         ! returns if there is one, otherwise evaluate the rule through
         ! apply-non-memo-rule.
 248     2dup recall [
 249         nip apply-memo-rule
 250     ] [
 251         apply-non-memo-rule
 252     ] if* ; inline
 253
 254 : with-packrat ( input quot -- result )
 255     ! Run the quotation with a packrat cache active.
         ! A new variable scope is built with: the input, pos at 0,
         ! an empty lrstack (f), an empty error-stack vector, and empty
         ! heads and packrat tables. quot runs inside that scope.
 256     [
 257         swap input ,,
 258         0 pos ,,
 259         f lrstack ,,
 260         V{ } clone error-stack ,,
 261         H{ } clone \ heads ,,
 262         H{ } clone \ packrat ,,
 263     ] H{ } make swap with-variables ; inline
 264
 265 :: process-parser-result ( result -- result )
         ! Turn the value from apply-rule into what parser words return:
         ! f for fail, otherwise a parse-result pairing the input that
         ! remains at the current pos with the given ast.
 266     result fail = [
 267         f
 268     ] [
 269         input-slice result <parse-result>
 270     ] if ;
 271
 272 : execute-parser ( word -- result )
         ! Apply the compiled parser word at the current position and
         ! convert the outcome to a parse-result or f.
 273     pos get apply-rule process-parser-result ;
 274
 275 : preset-parser-word ( parser -- word parser )
         ! Create a fresh gensym word and store it in the parser's
         ! compiled slot before anything is defined for it.
 276     gensym [ >>compiled ] keep swap ;
 277
 278 : define-parser-word ( word parser -- )
 279     ! Define word, with effect ( -- result ), as the quotation
 280     ! produced by parser-quot for the parser's peg, and store
         ! the parser's id on the word as its "peg-id" property.
 281     [ peg>> parser-quot ( -- result ) define-declared ]
 282     [ id>> "peg-id" set-word-prop ] 2bi ;
 283
 284 : compile-parser-word ( parser -- word )
 285     ! Look to see if the given parser has been compiled.
 286     ! If not, compile it to a temporary word, cache it,
 287     ! and return it. Otherwise return the existing one.
 288     ! Circular parsers are supported by getting the word
 289     ! name and storing it in the cache, before compiling,
 290     ! so it is picked up when re-entered.
         ! ("The cache" here is the parser's own compiled slot, which
         ! preset-parser-word fills in before define-parser-word runs.)
 291     dup compiled>> [
 292         nip
 293     ] [
 294         preset-parser-word dupd define-parser-word
 295     ] if* ;
 296
 297 : execute-parser-quot ( parser -- quot )
         ! Compile the parser (if needed) and return a quotation that
         ! runs its word through execute-parser.
 298     compile-parser-word '[ _ execute-parser ] ;
 299
 300 : execute-parsers-quots ( parsers -- quots )
         ! One execute-parser quotation per parser. Every quotation
         ! except the first is rewritten in place to call merge-errors
         ! after the parser has run.
 301     [ execute-parser-quot ] map dup rest-slice
 302     [ '[ @ merge-errors ] ] map! drop ;
 303
 304 SYMBOL: delayed
     ! Variable bound by compile-parser to a table mapping placeholder
     ! words to the quotations of delay parsers (see fixup-delayed).
 305
 306 : fixup-delayed ( -- )
 307     ! Work through all delayed parsers and recompile their
 308     ! words to have the correct bodies.
         ! For each ( word, quot ) entry: call quot to obtain the
         ! parser, then define word as that parser's execute quotation.
 309     delayed get [
 310         call( -- parser ) execute-parser-quot ( -- result ) define-declared
 311     ] assoc-each ;
 312
 313 : compile-parser ( parser -- word )
         ! Compile parser to a word inside one compilation unit, with a
         ! fresh 'delayed' table in scope; delay parsers registered
         ! during compilation are resolved by fixup-delayed afterwards.
 314     [
 315         H{ } clone delayed [
 316             compile-parser-word fixup-delayed
 317         ] with-variable
 318     ] with-compilation-unit ;
 319
 320 : perform-parse ( input word -- result )
         ! Run the compiled parser word over input inside a fresh
         ! packrat scope and return its parse-result.
         ! On failure, throws the first entry of error-stack; if no
         ! error was recorded, throws a parse-error built from the
         ! current pos, the input, and f as messages.
 321     swap [
 322         execute-parser [
 323             error-stack get ?first [ throw ] [
 324                 pos get input get f <parse-error> throw
 325             ] if*
 326         ] unless*
 327     ] with-packrat ;
 328
 329 PRIVATE>
 330
 331 : (parse) ( input parser -- result )
         ! Compile parser and run it over input, returning the whole
         ! parse-result. Failures are thrown by perform-parse.
 332     compile-parser perform-parse ;
 333
 334 : parse ( input parser -- ast )
         ! Like (parse) but returns only the ast. Unconsumed input is
         ! not checked here; see parse-fully.
 335     (parse) ast>> ;
 336
 337 ERROR: unable-to-fully-parse remaining ;
     ! Thrown by check-parse-result with the non-blank input left over.
 338
 339 ERROR: could-not-parse ;
     ! Thrown by check-parse-result when it is given f instead of a
     ! parse-result.
 340
 341 : check-parse-result ( result -- result )
 342     ! Throws if there is no result or if non-blank input remains.
 343     dup [
 344         dup remaining>> [ blank? ] trim
 345         [ unable-to-fully-parse ] unless-empty
 346     ] [
 347         drop could-not-parse
 348     ] if ;
 349
 350 : parse-fully ( input parser -- ast )
         ! Like parse, but check-parse-result throws
         ! unable-to-fully-parse if anything other than blanks remains.
 351     (parse) check-parse-result ast>> ;
 352
 353 <PRIVATE
 354
 355 TUPLE: token-parser symbol ;
 356
 357 : parse-token ( input string -- result )
 358     ! Parse the string, returning a parse result
         ! If input starts with string, the result holds the rest of
         ! the input with string itself as the ast. Otherwise an error
         ! is recorded at the current pos -- 'got' is the sequence
         ! underlying the input slice, and the message is the token
         ! wrapped in single quotes -- and f is returned.
 359     [ ?head-slice ] keep swap [
 360         <parse-result>
 361     ] [
 362         [ seq>> pos get swap ] dip "'" "'" surround 1vector add-error f
 363     ] if ;
 364
 365 M: token-parser parser-quot
         ! Match the stored token against the input at the current pos.
 366     symbol>> '[ input-slice _ parse-token ] ;
 367
 368 TUPLE: satisfy-parser quot ;
 369
 370 :: parse-satisfy ( input quot -- result/f )
         ! Empty input gives f. Otherwise the first element is split
         ! off and passed to quot: a true answer yields a result whose
         ! ast is that element and whose remaining is the rest of the
         ! input; a false answer yields f.
 371     input [ f ] [
 372         unclip-slice dup quot call [
 373             <parse-result>
 374         ] [
 375             2drop f
 376         ] if
 377     ] if-empty ; inline
 378
 379 M: satisfy-parser parser-quot
         ! Test the element at the current pos with the stored quot.
 380     quot>> '[ input-slice _ parse-satisfy ] ;
 381
 382 TUPLE: range-parser min max ;
 383
 384 :: parse-range ( input min max -- result/f )
         ! Empty input gives f. Otherwise, if the first element lies
         ! between min and max, it becomes the ast and the rest of the
         ! input the remaining; if not, the result is f.
 385     input empty? [
 386         f
 387     ] [
 388         input first min max between? [
 389             input unclip-slice <parse-result>
 390         ] [ f ] if
 391     ] if ;
 392
 393 M: range-parser parser-quot
         ! Check the element at the current pos against the stored bounds.
 394     [ min>> ] [ max>> ] bi '[ input-slice _ _ parse-range ] ;
 395
 396 TUPLE: seq-parser parsers ;
 397
 398 : calc-seq-result ( prev-result current-result -- next-result )
         ! Fold one element's result into the accumulated result.
         ! If current-result is f the whole sequence fails (f).
         ! Otherwise prev-result takes over its remaining input, and
         ! its ast is pushed onto prev-result's ast unless it is ignore.
 399     [
 400         [ remaining>> >>remaining ] [ ast>> ] bi
 401         dup ignore = [ drop ] [ over ast>> push ] if
 402     ] [
 403         drop f
 404     ] if* ;
 405
 406 : parse-seq-element ( result quot -- result )
         ! Run quot and fold its result in, but only while the
         ! accumulated result is still non-f.
 407     '[ @ calc-seq-result ] [ f ] if* ; inline
 408
 409 M: seq-parser parser-quot
         ! Start from a result with the current input and an empty ast
         ! vector, then apply each element in order; 1&& stops at the
         ! first element that yields f.
 410     parsers>> execute-parsers-quots
 411     [ '[ _ parse-seq-element ] ] map
 412     '[ input-slice V{ } clone <parse-result> _ 1&& ] ;
 413
 414 TUPLE: choice-parser parsers ;
 415
 416 M: choice-parser parser-quot
         ! Try the alternatives in order; 0|| returns the first non-f
         ! result, or f if every alternative fails.
 417     parsers>> execute-parsers-quots '[ _ 0|| ] ;
 418
 419 TUPLE: repeat0-parser parser ;
 420
 421 : repeat-loop ( quot: ( -- result/f ) result -- result )
         ! Call quot until it returns f. After each success the
         ! accumulated result takes over the new remaining input and
         ! the new ast is pushed onto its ast vector.
 422     over call [
 423         [ remaining>> >>remaining ] [ ast>> ] bi
 424         over ast>> push repeat-loop
 425     ] [
 426         nip
 427     ] if* ; inline recursive
 428
 429 M: repeat0-parser parser-quot
         ! Zero or more matches: starts from a result with an empty ast
         ! vector, so the result is non-f even with no matches.
 430     parser>> execute-parser-quot '[
 431         input-slice V{ } clone <parse-result> _ swap repeat-loop
 432     ] ;
 433
 434 TUPLE: repeat1-parser parser ;
 435
 436 : repeat1-empty-check ( result -- result )
         ! A result whose ast is empty (no matches) becomes f; f stays f.
 437     dup [ dup ast>> empty? [ drop f ] when ] when ;
 438
 439 M: repeat1-parser parser-quot
         ! One or more matches: same loop as repeat0, followed by the
         ! empty check.
 440     parser>> execute-parser-quot '[
 441         input-slice V{ } clone <parse-result> _ swap repeat-loop
 442         repeat1-empty-check
 443     ] ;
 444
 445 TUPLE: optional-parser parser ;
 446
 447 : check-optional ( result -- result )
         ! A failed match (f) is replaced by a successful result that
         ! has f as its ast and the current input as its remaining.
 448     dup [ drop input-slice f <parse-result> ] unless ;
 449
 450 M: optional-parser parser-quot
         ! Run the wrapped parser, then turn failure into success.
 451     parser>> execute-parser-quot '[ @ check-optional ] ;
 452
 453 TUPLE: semantic-parser parser quot ;
 454
 455 : check-semantic ( result quot -- result )
         ! For a non-f result, call quot on its ast; if quot answers
         ! f the result is replaced by f. An f result passes through.
 456     dupd '[ dup ast>> @ [ drop f ] unless ] when ; inline
 457
 458 M: semantic-parser parser-quot
         ! Run the wrapped parser, then apply the stored predicate.
 459     [ parser>> execute-parser-quot ] [ quot>> ] bi
 460     '[ @ _ check-semantic ] ;
 461
 462 TUPLE: ensure-parser parser ;
 463
 464 :: check-ensure ( old-input result -- result )
         ! If the wrapped parser matched, succeed with the input as it
         ! was before the match and ignore as the ast; otherwise f.
 465     result [ old-input ignore <parse-result> ] [ f ] if ;
 466
 467 M: ensure-parser parser-quot
         ! Capture the input first, run the wrapped parser, then check.
 468     parser>> execute-parser-quot '[ input-slice @ check-ensure ] ;
 469
 470 TUPLE: ensure-not-parser parser ;
 471
 472 :: check-ensure-not ( old-input result -- result )
         ! If the wrapped parser matched, fail with f; otherwise succeed
         ! with the input as it was beforehand and ignore as the ast.
 473     result [ f ] [ old-input ignore <parse-result> ] if ;
 474
 475 M: ensure-not-parser parser-quot
         ! Capture the input first, run the wrapped parser, then check.
 476     parser>> execute-parser-quot '[ input-slice @ check-ensure-not ] ;
 477
 478 TUPLE: action-parser parser quot ;
 479
 480 : check-action ( result quot -- result )
         ! For a non-f result, replace its ast with the value quot
         ! returns for it. An f result passes through unchanged.
 481     dupd '[ [ _ call( ast -- ast ) ] change-ast ] when ;
 482
 483 M: action-parser parser-quot
         ! Run the wrapped parser, then transform the ast with quot.
 484     [ parser>> execute-parser-quot ] [ quot>> ] bi
 485     '[ @ _ check-action ] ;
 486
 487 TUPLE: sp-parser parser ;
 488
 489 M: sp-parser parser-quot
         ! Skip leading blanks: pos is moved to where the input,
         ! trimmed of leading blank? elements, begins; then the wrapped
         ! parser runs.
 490     parser>> execute-parser-quot '[
 491         input-slice [ blank? ] trim-head-slice input-from pos namespaces:set @
 492     ] ;
 493
 494 TUPLE: delay-parser quot ;
 495
 496 M: delay-parser parser-quot
 497     ! For efficiency we memoize the quotation.
 498     ! This way it is run only once and the
 499     ! parser constructed once at run time.
         ! Concretely: a fresh gensym is registered in 'delayed' with
         ! the stored quotation, and the returned body just calls that
         ! gensym. fixup-delayed gives the gensym its real definition.
 500     quot>> gensym [ delayed get set-at ] keep 1quotation ;
 501
 502 TUPLE: box-parser quot ;
 503
 504 M: box-parser parser-quot
 505     ! Calls the quotation at compile time
 506     ! to produce the parser to be compiled.
 507     ! This differs from 'delay' which calls
 508     ! it at run time.
         ! The body returned is the execute quotation of the parser
         ! that the stored quotation produced.
 509     quot>> call( -- parser ) execute-parser-quot ;
 510
 511 PRIVATE>
 512
 513 : token ( string -- parser )
         ! Parser that matches string at the current position.
 514     token-parser boa wrap-peg ;
 515
 516 : satisfy ( quot -- parser )
         ! Parser that matches one element for which quot answers true.
 517     satisfy-parser boa wrap-peg ;
 518
 519 : range ( min max -- parser )
         ! Parser that matches one element lying between min and max.
 520     range-parser boa wrap-peg ;
 521
 522 : seq ( seq -- parser )
         ! Parser that applies every parser in seq in order; the ast
         ! is a vector of the element asts (those equal to ignore are
         ! left out).
 523     seq-parser boa wrap-peg ;
 524
 525 : 2seq ( parser1 parser2 -- parser )
         ! seq of exactly two parsers.
 526     2array seq ;
 527
 528 : 3seq ( parser1 parser2 parser3 -- parser )
         ! seq of exactly three parsers.
 529     3array seq ;
 530
 531 : 4seq ( parser1 parser2 parser3 parser4 -- parser )
         ! seq of exactly four parsers.
 532     4array seq ;
 533
 534 : seq* ( quot -- parser )
         ! seq of the parsers that quot collects while run under make.
 535     { } make seq ; inline
 536
 537 : choice ( seq -- parser )
         ! Parser that tries the parsers in seq in order and yields the
         ! first successful result.
 538     choice-parser boa wrap-peg ;
 539
 540 : 2choice ( parser1 parser2 -- parser )
         ! choice between exactly two parsers.
 541     2array choice ;
 542
 543 : 3choice ( parser1 parser2 parser3 -- parser )
         ! choice between exactly three parsers.
 544     3array choice ;
 545
 546 : 4choice ( parser1 parser2 parser3 parser4 -- parser )
         ! choice between exactly four parsers.
 547     4array choice ;
 548
 549 : choice* ( quot -- parser )
         ! choice of the parsers that quot collects while run under make.
 550     { } make choice ; inline
 551
 552 : repeat0 ( parser -- parser )
         ! Zero or more repetitions; the ast is a vector of the asts.
 553     repeat0-parser boa wrap-peg ;
 554
 555 : repeat1 ( parser -- parser )
         ! One or more repetitions; fails when there is no match at all.
 556     repeat1-parser boa wrap-peg ;
 557
 558 : optional ( parser -- parser )
         ! Always succeeds; the ast is f when the parser did not match.
 559     optional-parser boa wrap-peg ;
 560
 561 : semantic ( parser quot -- parser )
         ! Succeeds only if parser matches and quot accepts its ast.
 562     semantic-parser boa wrap-peg ;
 563
 564 : ensure ( parser -- parser )
         ! Succeeds, with ignore as the ast, if parser matches; the
         ! result's remaining is the input from before the match.
 565     ensure-parser boa wrap-peg ;
 566
 567 : ensure-not ( parser -- parser )
         ! Succeeds, with ignore as the ast, if parser does not match.
 568     ensure-not-parser boa wrap-peg ;
 569
 570 : action ( parser quot -- parser )
         ! On a match, the ast is replaced by quot applied to it.
 571     action-parser boa wrap-peg ;
 572
 573 : sp ( parser -- parser )
         ! Skip leading blanks, then run parser.
 574     sp-parser boa wrap-peg ;
 575
 576 : hide ( parser -- parser )
         ! Replace the ast with ignore, so that seq omits it.
 577     [ drop ignore ] action ;
 578
 579 : delay ( quot -- parser )
         ! Parser whose real parser is produced later by calling quot.
 580     delay-parser boa wrap-peg ;
 581
 582 : box ( quot -- parser )
 583     ! Because a box has its quotation run at compile time
 584     ! it must always have a new parser wrapper created,
 585     ! not a cached one. This is because the same box,
 586     ! compiled twice can have a different compiled word
 587     ! due to running at compile time.
 588     ! Why the [ ] action at the end? Box parsers don't get
 589     ! memoized during parsing due to all box parsers being
 590     ! unique. This breaks left recursion detection during the
 591     ! parse. The action adds an indirection with a parser type
 592     ! that gets memoized and fixes this. Need to rethink how
 593     ! to fix boxes so this isn't needed...
         ! Hence the parser tuple is built directly with a fresh id
         ! instead of going through wrap-peg and its cache.
 594     box-parser boa f next-id parser boa [ ] action ;
 595
 596 SYNTAX: PARTIAL-PEG:
         ! Defines a word whose body builds a parser; the word defined
         ! runs that compiled parser on its input and returns the ast
         ! without checking for leftover input.
         ! NOTE(review): presumably (:) supplies word, definition
         ! quotation and stack effect, filling the three outer `_`
         ! in that order -- confirm against the (:) documentation.
 597     (:) '[
 598         [
 599             _
 600             _ call( -- parser ) compile-parser
 601             '[ _ perform-parse ast>> ]
 602             _ define-declared
 603         ] with-compilation-unit
 604     ] append! ;
 605
 606 SYNTAX: PEG:
         ! Defines a word whose body builds a parser; the word defined
         ! runs that compiled parser on its input, passes the result
         ! through check-parse-result (which throws when non-blank input
         ! remains) and returns the ast.
         ! NOTE(review): presumably (:) supplies word, definition
         ! quotation and stack effect, filling the three outer `_`
         ! in that order -- confirm against the (:) documentation.
 607     (:) '[
 608         [
 609             _
 610             _ call( -- parser ) compile-parser
 611             '[ _ perform-parse check-parse-result ast>> ]
 612             _ define-declared
 613         ] with-compilation-unit
 614     ] append! ;
 615
 616 { "debugger" "peg" } "peg.debugger" require-when
     ! Requests "peg.debugger" via require-when, conditional on the
     ! "debugger" and "peg" vocabularies.