basis/peg/ebnf/ebnf-docs.factor

   1 ! Copyright (C) 2009 Chris Double.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: help.syntax help.markup peg peg.search ;
   4 IN: peg.ebnf
   5
   6 HELP: <EBNF
   7 { $syntax "<EBNF ...ebnf... EBNF>" }
   8 { $values { "...ebnf..." "EBNF DSL text" } }
   9 { $description
  10     "Creates a " { $vocab-link "peg" }
  11     " object that parses a string using the syntax "
  12     "defined with the EBNF DSL. The peg object can be run using the " { $link parse }
  13     " word and can be used with the " { $link search } " and " { $link replace } " words."
  14 }
  15 { $examples
  16     { $example
  17        "USING: kernel prettyprint peg.ebnf peg.search ;"
  18        "\"abcdab\" <EBNF rule=\"a\" \"b\" => [[ drop \"foo\" ]] EBNF> replace ."
  19        "\"foocdfoo\""
  20     }
  21 } ;
  22
  23 HELP: [EBNF
  24 { $syntax "[EBNF ...ebnf... EBNF]" }
  25 { $values { "...ebnf..." "EBNF DSL text" } }
  26 { $description
  27     "Creates and calls a quotation that parses a string using the syntax "
  28     "defined with the EBNF DSL. The quotation has stack effect "
  29     { $snippet "( string -- ast )" } " where 'string' is the text to be parsed "
  30     "and 'ast' is the resulting abstract syntax tree. If the parsing fails the "
  31     "quotation throws an exception."
  32 }
  33 { $examples
  34     { $example
  35        "USING: prettyprint peg.ebnf ;"
  36        "\"ab\" [EBNF rule=\"a\" \"b\" EBNF] ."
  37        "V{ \"a\" \"b\" }"
  38     }
  39 } ;
  40
  41 HELP: EBNF:
  42 { $syntax "EBNF: word ...ebnf... ;EBNF" }
  43 { $values { "word" "a word" } { "...ebnf..." "EBNF DSL text" } }
  44 { $description
  45     "Defines a word that when called will parse a string using the syntax "
  46     "defined with the EBNF DSL. The word has stack effect "
  47     { $snippet "( string -- ast )" } " where 'string' is the text to be parsed "
  48     "and 'ast' is the resulting abstract syntax tree. If the parsing fails the "
  49     "word throws an exception."
  50 }
  51 { $examples
  52     { $example
  53        "USING: prettyprint peg.ebnf ;"
  54        "IN: scratchpad"
  55        "EBNF: foo rule=\"a\" \"b\" ;EBNF"
  56        "\"ab\" foo ."
  57        "V{ \"a\" \"b\" }"
  58     }
  59 } ;
  60
  61 ARTICLE: "peg.ebnf.strings" "Strings"
  62 "A string in a rule will match that sequence of characters from the input string. "
  63 "The AST result from the match is the string itself."
  64 { $examples
  65     { $example
  66        "USING: prettyprint peg.ebnf ;"
  67        "\"helloworld\" [EBNF rule=\"hello\" \"world\" EBNF] ."
  68        "V{ \"hello\" \"world\" }"
  69     }
  70 } ;
  71
  72 ARTICLE: "peg.ebnf.any" "Any"
  73 "A full stop character (.) will match any single token in the input string. "
  74 "The AST resulting from this is the token itself."
  75 { $examples
  76     { $example
  77        "USING: prettyprint peg.ebnf ;"
  78        "\"abc\" [EBNF rule=\"a\" . \"c\" EBNF] ."
  79        "V{ \"a\" 98 \"c\" }"
  80     }
  81 } ;
  82
  83 ARTICLE: "peg.ebnf.sequence" "Sequence"
  84 "Any white space separated rule element is considered a sequence. Each rule "
  85 "in the sequence is matched from the input stream, consuming the input as it "
  86 "goes. The AST result is a vector containing the results of each rule element in "
  87 "the sequence."
  88 { $examples
  89     { $example
  90        "USING: prettyprint peg.ebnf ;"
  91        "\"abbba\" [EBNF rule=\"a\" (\"b\")* \"a\" EBNF] ."
  92        "V{ \"a\" V{ \"b\" \"b\" \"b\" } \"a\" }"
  93     }
  94 }
  95 ;
  96
  97 ARTICLE: "peg.ebnf.grouping" "Group"
  98 "Any sequence of rules may be grouped using parentheses (" { $snippet "()" } "). "
  99 "The parenthesized sequence can then be modified as a group. Parentheses also "
 100 "delimit sets of choices separated by pipe (|) characters."
 101 $nl
 102 "A group can also be delimited with curly braces (" { $snippet "{}" } "), in "
 103 "which case an implicit optional whitespace-matching rule will be inserted between "
 104 "rules sequenced within the braces."
 105 { $examples
 106     { $example
 107        "USING: prettyprint peg.ebnf ;"
 108        "\"abcca\" [EBNF rule=\"a\" (\"b\" | \"c\")* \"a\" EBNF] ."
 109        "V{ \"a\" V{ \"b\" \"c\" \"c\" } \"a\" }"
 110     }
 111     { $example
 112        "USING: prettyprint peg.ebnf ;"
 113        "\"ab  c\nd \" [EBNF rule={\"a\" \"b\" \"c\" \"d\"} EBNF] ."
 114        "V{ \"a\" \"b\" \"c\" \"d\" }"
 115     }
 116 }
 117 ;
 118
 119 ARTICLE: "peg.ebnf.choice" "Choice"
 120 "Any rule element separated by a pipe character (|) is considered a choice. Choices "
 121 "are matched against the input stream in order. If a match succeeds then the remaining "
 122 "choices are discarded and the result of the match is the AST result of the choice."
 123 { $examples
 124     { $example
 125        "USING: prettyprint peg.ebnf ;"
 126        "\"a\" [EBNF rule=\"a\" | \"b\" | \"c\" EBNF] ."
 127        "\"a\""
 128     }
 129     { $example
 130        "USING: prettyprint peg.ebnf ;"
 131        "\"b\" [EBNF rule=\"a\" | \"b\" | \"c\" EBNF] ."
 132        "\"b\""
 133     }
 134     { $example
 135        "USING: prettyprint peg.ebnf ;"
 136        "\"d\" [EBNF rule=\"a\" | \"b\" | \"c\" EBNF] ."
 137        "Peg parsing error at character position 0.\nExpected token 'c' or token 'b' or token 'a'"
 138     }
 139 }
 140 ;
 141
 142 ARTICLE: "peg.ebnf.ignore" "Ignore"
 143 "Any rule element followed by a tilde (~) will be matched, and its results "
 144 "discarded from the AST."
 145 { $examples
 146     { $example
 147        "USING: prettyprint peg.ebnf ;"
 148        "\"abc\" [EBNF rule=\"a\" \"b\"~ \"c\" EBNF] ."
 149        "V{ \"a\" \"c\" }"
 150     }
 151 }
 152 ;
 153
 154 ARTICLE: "peg.ebnf.option" "Option"
 155 "Any rule element followed by a question mark (?) is considered optional. The "
 156 "rule is tested against the input. If it succeeds the result is stored in the AST. "
 157 "If it fails then the parse still succeeds and false (f) is stored in the AST."
 158 { $examples
 159     { $example
 160        "USING: prettyprint peg.ebnf ;"
 161        "\"abc\" [EBNF rule=\"a\" \"b\"? \"c\" EBNF] ."
 162        "V{ \"a\" \"b\" \"c\" }"
 163     }
 164     { $example
 165        "USING: prettyprint peg.ebnf ;"
 166        "\"ac\" [EBNF rule=\"a\" \"b\"? \"c\" EBNF] ."
 167        "V{ \"a\" f \"c\" }"
 168     }
 169 }
 170 ;
 171
 172 ARTICLE: "peg.ebnf.character-class" "Character Class"
 173 "Character class matching can be done using a range of characters defined in "
 174 "square brackets. Multiple ranges can be included in a single character class "
 175 "definition. The syntax for the range is a start character, followed by a minus "
 176 "(-) followed by an end character. For example " { $snippet "[a-zA-Z]" } ". "
 177 "The AST resulting from the match is an integer of the character code for the "
 178 "character that matched."
 179 { $examples
 180     { $example
 181        "USING: prettyprint peg.ebnf ;"
 182        "\"123\" [EBNF rule=[0-9]+ EBNF] ."
 183        "V{ 49 50 51 }"
 184     }
 185 }
 186 ;
 187
 188 ARTICLE: "peg.ebnf.one-or-more" "One or more"
 189 "Any rule element followed by a plus (+) matches one or more instances of the rule "
 190 "from the input string. The AST result is the vector of the AST results from "
 191 "the matched rule."
 192 { $examples
 193     { $example
 194        "USING: prettyprint peg.ebnf ;"
 195        "\"aab\" [EBNF rule=\"a\"+ \"b\" EBNF] ."
 196        "V{ V{ \"a\" \"a\" } \"b\" }"
 197     }
 198 }
 199 ;
 200
 201 ARTICLE: "peg.ebnf.zero-or-more" "Zero or more"
 202 "Any rule element followed by an asterisk (*) matches zero or more instances of the rule "
 203 "from the input string. The AST result is the vector of the AST results from "
 204 "the matched rule. This will be empty if there are no matches."
 205 { $examples
 206     { $example
 207        "USING: prettyprint peg.ebnf ;"
 208        "\"aab\" [EBNF rule=\"a\"* \"b\" EBNF] ."
 209        "V{ V{ \"a\" \"a\" } \"b\" }"
 210     }
 211     { $example
 212        "USING: prettyprint peg.ebnf ;"
 213        "\"b\" [EBNF rule=\"a\"* \"b\" EBNF] ."
 214        "V{ V{ } \"b\" }"
 215     }
 216 }
 217 ;
 218
 219 ARTICLE: "peg.ebnf.and" "And"
 220 "Any rule element prefixed by an ampersand (&) performs the Parsing Expression "
 221 "Grammar 'And Predicate' match. It attempts to match the rule against the input "
 222 "string. It will cause the parse to succeed or fail depending on if the rule "
 223 "succeeds or fails. It will not consume anything from the input string however and "
 224 "does not leave any result in the AST. This can be used for lookahead and "
 225 "disambiguation in choices."
 226 { $examples
 227     { $example
 228        "USING: prettyprint peg.ebnf ;"
 229        "\"ab\" [EBNF rule=&(\"a\") \"a\" \"b\" EBNF] ."
 230        "V{ \"a\" \"b\" }"
 231     }
 232 }
 233 ;
 234
 235 ARTICLE: "peg.ebnf.not" "Not"
 236 "Any rule element prefixed by an exclamation mark (!) performs the Parsing Expression "
 237 "Grammar 'Not Predicate' match. It attempts to match the rule against the input "
 238 "string. It will cause the parse to succeed if the rule match fails, and to fail "
 239 "if the rule match succeeds. It will not consume anything from the input string "
 240 "however and does not leave any result in the AST. This can be used for lookahead and "
 241 "disambiguation in choices."
 242 { $examples
 243     { $example
 244        "USING: prettyprint peg.ebnf ;"
 245        "\"<abcd>\" [EBNF rule=\"<\" (!(\">\") .)* \">\" EBNF] ."
 246        "V{ \"<\" V{ 97 98 99 100 } \">\" }"
 247     }
 248 }
 249 ;
 250
 251 ARTICLE: "peg.ebnf.action" "Action"
 252 "An action is a quotation that is run after a rule matches. The quotation "
 253 "consumes the AST of the rule match and leaves a new AST as the result. "
 254 "The stack effect of the action can be " { $snippet "( ast -- ast )" } " or "
 255 { $snippet "( -- ast )" } ". "
 256 "If it is the latter then the original AST is implicitly dropped and will be "
 257 "replaced by the AST left on the stack. This is mostly useful if variables are "
 258 "used in the rule since they can be referenced like locals in the action quotation. "
 259 "The action is defined by having a ' => ' at the end of a rule and "
 260 "using '[[' and ']]' to open and close the quotation. "
 261 "If an action leaves the object 'ignore' on the stack then the result of that "
 262 "action will not be put in the AST of the result."
 263 { $examples
 264     { $example
 265        "USING: prettyprint peg.ebnf strings ;"
 266        "\"<abcd>\" [EBNF rule=\"<\" ((!(\">\") .)* => [[ >string ]]) \">\" EBNF] ."
 267        "V{ \"<\" \"abcd\" \">\" }"
 268     }
 269     { $example
 270        "USING: prettyprint peg.ebnf math.parser ;"
 271        "\"123\" [EBNF rule=[0-9]+ => [[ string>number ]] EBNF] ."
 272        "123"
 273     }
 274 }
 275 ;
 276
 277 ARTICLE: "peg.ebnf.semantic-action" "Semantic Action"
 278 "Semantic actions allow providing a quotation that gets run on the AST of a "
 279 "matched rule that returns success or failure. The result of the parse is decided by "
 280 "the result of the semantic action. The stack effect for the quotation is "
 281 { $snippet ( ast -- ? ) } ". "
 282 "A semantic action follows the rule it applies to and is delimeted by '?[' and ']?'."
 283 { $examples
 284     { $example
 285        "USING: prettyprint peg.ebnf math math.parser ;"
 286        "\"1\" [EBNF rule=[0-9] ?[ digit> odd? ]? EBNF] ."
 287        "49"
 288     }
 289     { $example
 290        "USING: prettyprint peg.ebnf math math.parser ;"
 291        "\"2\" [EBNF rule=[0-9] ?[ digit> odd? ]? EBNF] ."
 292        "Sequence index out of bounds\nindex 0\nseq   V{ }"
 293     }
 294 }
 295 ;
 296
 297 ARTICLE: "peg.ebnf.variable" "Variable"
 298 "Variables names can be suffixed to a rule element using the colon character (:) "
 299 "followed by the variable name. These can then be used in rule actions to refer to "
 300 "the AST result of the rule element with that variable name."
 301 { $examples
 302     { $example
 303        "USING: prettyprint peg.ebnf math.parser ;"
 304        "\"1+2\" [EBNF rule=[0-9]:a \"+\" [0-9]:b => [[ a digit> b digit> + ]] EBNF] ."
 305        "3"
 306     }
 307 }
 308 ;
 309
 310 ARTICLE: "peg.ebnf.foreign-rules" "Foreign Rules"
 311 "Rules can call out to other peg.ebnf defined parsers. The result of "
 312 "the foreign call then becomes the AST of the successful parse. Foreign rules "
 313 "are invoked using '<foreign word-name>' or '<foreign word-name rule>'. The "
 314 "latter allows calling a specific rule in a previously designed peg.ebnf parser. "
 315 "If the 'word-name' is not the name of a peg.ebnf defined parser then it must be "
 316 "a word with stack effect " { $snippet "( -- parser )" } ". It must return a "
 317 { $vocab-link "peg" } " defined parser and it will be called to perform the parse "
 318 "for that rule."
 319 { $examples
 320     { $code
 321        "USING: prettyprint peg.ebnf ;"
 322        "EBNF: parse-string"
 323        "StringBody = (!('\"') .)*"
 324        "String= '\"' StringBody:b '\"' => [[ b >string ]]"
 325        ";EBNF"
 326        "EBNF: parse-two-strings"
 327        "TwoStrings = <foreign parse-string String> <foreign parse-string String>"
 328        ";EBNF"
 329        "EBNF: parse-two-strings"
 330        "TwoString = <foreign parse-string> <foreign parse-string>"
 331        ";EBNF"
 332     }
 333     { $code
 334        ": a-token ( -- parser ) \"a\" token ;"
 335        "EBNF: parse-abc"
 336        "abc = <foreign a-token> 'b' 'c'"
 337        ";EBNF"
 338    }
 339 }
 340 ;
 341
 342 ARTICLE: "peg.ebnf.tokenizers" "Tokenizers"
 343 "It is possible to override the tokenizer in an EBNF defined parser. "
 344 "Usually the input sequence to be parsed is an array of characters or a string. "
 345 "Terminals in a rule match successive characters in the array or string. "
 346 { $examples
 347     { $code
 348         "EBNF: foo"
 349         "rule = \"++\" \"--\""
 350         ";EBNF"
 351     }
 352 }
 353 "This parser when run with the string \"++--\" or the array "
 354 "{ CHAR: + CHAR: + CHAR: - CHAR: - } will succeed with an AST of { \"++\" \"--\" }. "
 355 "If you want to add whitespace handling to the grammar you need to put it "
 356 "between the terminals:"
 357 { $examples
 358     { $code
 359         "EBNF: foo"
 360         "space = (\" \" | \"\\r\" | \"\\t\" | \"\\n\")"
 361         "spaces = space* => [[ drop ignore ]]"
 362         "rule = spaces \"++\" spaces \"--\" spaces"
 363         ";EBNF"
 364     }
 365 }
 366 "In a large grammar this gets tedious and makes the grammar hard to read. "
 367 "Instead you can write a rule to split the input sequence into tokens, and "
 368 "have the grammar operate on these tokens. This is how the previous example "
 369 "might look:"
 370 { $examples
 371     { $code
 372         "EBNF: foo"
 373         "space = (\" \" | \"\\r\" | \"\\t\" | \"\\n\")"
 374         "spaces = space* => [[ drop ignore ]]"
 375         "tokenizer = spaces ( \"++\" | \"--\" )"
 376         "rule = \"++\" \"--\""
 377         ";EBNF"
 378      }
 379 }
 380 "'tokenizer' is the name of a built in rule. Once defined it is called to "
 381 "retrieve the next complete token from the input sequence. So the first part "
 382 "of 'rule' is to try and match \"++\". It calls the tokenizer to get the next "
 383 "complete token. This ignores spaces until it finds a \"++\" or \"--\". "
 384 "It is as if the input sequence for the parser was actually { \"++\" \"--\" } "
 385 "instead of the string \"++--\". With the new tokenizer \"....\" sequences "
 386 "in the grammar are matched for equality against the token, rather than a "
 387 "string comparison against successive items in the sequence. This can be used "
 388 "to match an AST from a tokenizer. "
 389 $nl
 390 "In this example I split the tokenizer into a separate parser and use "
 391 "'foreign' to call it from the main one. This allows testing of the "
 392 "tokenizer separately:"
 393 { $examples
 394     { $example
 395         "USING: prettyprint peg peg.ebnf kernel math.parser strings"
 396         "accessors math arrays ;"
 397         "IN: scratchpad"
 398         ""
 399         "TUPLE: ast-number value ;"
 400         "TUPLE: ast-string value ;"
 401         ""
 402         "EBNF: foo-tokenizer"
 403         "space = (\" \" | \"\\r\" | \"\\t\" | \"\\n\")"
 404         "spaces = space* => [[ drop ignore ]]"
 405         ""
 406         "number = [0-9]+ => [[ >string string>number ast-number boa ]]"
 407         "operator = (\"+\" | \"-\")"
 408         ""
 409         "token = spaces ( number | operator )"
 410         "tokens = token*"
 411         ";EBNF"
 412         ""
 413         "EBNF: foo"
 414         "tokenizer = <foreign foo-tokenizer token>"
 415         ""
 416         "number = . ?[ ast-number? ]? => [[ value>> ]]"
 417         "string = . ?[ ast-string? ]? => [[ value>> ]]"
 418         ""
 419         "rule = string:a number:b \"+\" number:c => [[ a b c + 2array ]]"
 420         ";EBNF"
 421         ""
 422         "\"123 456 +\" foo-tokenizer ."
 423         "V{\n    T{ ast-number { value 123 } }\n    T{ ast-number { value 456 } }\n    \"+\"\n}"
 424     }
 425 }
 426 "The '.' EBNF production means match a single object in the source sequence. "
 427 "Usually this is a character. With the replacement tokenizer it is either a "
 428 "number object, a string object or a string containing the operator. "
 429 "Using a tokenizer in language grammars makes it easier to deal with whitespace. "
 430 "Defining tokenizers in this way has the advantage of the tokenizer and parser "
 431 "working in one pass. There is no tokenization occurring over the whole string "
 432 "followed by the parse of that result. It tokenizes as it needs to. You can even "
 433 "switch tokenizers multiple times during a grammar. Rules use the tokenizer that "
 434 "was defined lexically before the rule. This is usefull in the JavaScript grammar:"
 435 { $examples
 436     { $code
 437         "EBNF: javascript"
 438         "tokenizer         = default"
 439         "nl                = \"\\r\" \"\\n\" | \"\\n\""
 440         "tokenizer         = <foreign tokenize-javascript Tok>"
 441         "..."
 442         "End                = !(.)"
 443         "Name               = . ?[ ast-name?   ]?   => [[ value>> ]] "
 444         "Number             = . ?[ ast-number? ]?   => [[ value>> ]]"
 445         "String             = . ?[ ast-string? ]?   => [[ value>> ]]"
 446         "RegExp             = . ?[ ast-regexp? ]?   => [[ value>> ]]"
 447         "SpacesNoNl         = (!(nl) Space)* => [[ ignore ]]"
 448         "Sc                 = SpacesNoNl (nl | &(\"}\") | End)| \";\""
 449     }
 450 }
 451 "Here the rule 'nl' is defined using the default tokenizer of sequential "
 452 "characters ('default' has the special meaning of the built in tokenizer). "
 453 "This is followed by using the JavaScript tokenizer for the remaining rules. "
 454 "This tokenizer strips out whitespace and newlines. Some rules in the grammar "
 455 "require checking for a newline. In particular the automatic semicolon insertion "
 456 "rule (managed by the 'Sc' rule here). If there is a newline, the semicolon can "
 457 "be optional in places. "
 458 { $examples
 459     { $code
 460       "\"do\" Stmt:s \"while\" \"(\" Expr:c \")\" Sc    => [[ s c ast-do-while boa ]]"
 461     }
 462 }
 463 "Even though the JavaScript tokenizer has removed the newlines, the 'nl' rule can "
 464 "be used to detect them since it is using the default tokenizer. This allows "
 465 "grammars to mix and match the tokenizer as required to make them more readable."
 466 ;
 467
 468 ARTICLE: "peg.ebnf" "EBNF"
 469 "The " { $vocab-link "peg.ebnf" } " vocabulary provides a DSL that allows writing PEG parsers that look like "
 470 "EBNF syntax. It provides three parsing words described below. These words all "
 471 "accept the same EBNF syntax. The difference is in how they are used. "
 472 { $subsections
 473     POSTPONE: <EBNF
 474     POSTPONE: [EBNF
 475     POSTPONE: EBNF:
 476 }
 477 "The EBNF syntax is composed of a series of rules of the form:"
 478 { $code
 479   "rule1 = ..."
 480   "rule2 = ..."
 481 }
 482 "The last defined rule is the main rule for the EBNF. It is the first one run "
 483 "and it is expected that the remaining rules are used by that rule. Rules may be "
 484 "left recursive. "
 485 "Each rule can contain the following:"
 486 { $subsections "peg.ebnf.strings"
 487 "peg.ebnf.any"
 488 "peg.ebnf.sequence"
 489 "peg.ebnf.grouping"
 490 "peg.ebnf.choice"
 491 "peg.ebnf.ignore"
 492 "peg.ebnf.option"
 493 "peg.ebnf.one-or-more"
 494 "peg.ebnf.zero-or-more"
 495 "peg.ebnf.and"
 496 "peg.ebnf.not"
 497 "peg.ebnf.character-class"
 498 "peg.ebnf.foreign-rules"
 499 "peg.ebnf.action"
 500 "peg.ebnf.semantic-action"
 501 "peg.ebnf.variable" }
 502 "Grammars defined in EBNF need to handle each character, or sequence of "
 503 "characters in the input. This can be tedious for dealing with whitespace in "
 504 "grammars that have 'tokens' separated by whitespace. You can define your "
 505 "own tokenizer that for an EBNF grammar, and write the grammar in terms of "
 506 "those tokens, allowing you to ignore the whitespace issue. The tokenizer "
 507 "can be changed at various parts in the grammar as needed. The JavaScript grammar "
 508 "does this to define the optional semicolon rule for example."
 509 { $subsections "peg.ebnf.tokenizers" }
 510 ;
 511
 512 ABOUT: "peg.ebnf"