! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
-! See http://factorcode.org/license.txt for BSD license.
-USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
-combinators regexp.classes strings splitting peg locals accessors
-regexp.ast ;
+! See https://factorcode.org/license.txt for BSD license.
+USING: accessors arrays assocs combinators
+combinators.short-circuit interval-maps kernel math.parser
+multiline peg.ebnf regexp.ast regexp.classes sequences sets
+splitting strings unicode unicode.data unicode.script ;
IN: regexp.parser
: allowed-char? ( ch -- ? )
+! Error thrown, carrying the offending name, when a character-class
+! name cannot be resolved to a class.
ERROR: bad-class name ;
-: name>class ( name -- class )
+! Normalize a name for table lookup: case-fold it, then remove every
+! space, tab and underscore character.
+: simple ( str -- simple )
+ ! Alternatively, first collation key level?
+ >case-fold [ " \t_" member? ] reject ;
+
+! Build a hashtable from a sequence of names. Each entry maps the
+! `simple` form of a name (the key) to the original name (the value).
+: simple-table ( seq -- table )
+ [ [ simple ] keep ] H{ } map>assoc ;
+
+! Memoized `simple-table` built from the distinct values stored in
+! script-table.
+MEMO: simple-script-table ( -- table )
+ script-table interval-values members simple-table ;
+
+! Memoized `simple-table` built from the `categories` sequence.
+MEMO: simple-category-table ( -- table )
+ categories simple-table ;
+
+! Resolve a name to a class by trying these cases in order:
+!   1. a one-character name drawn from "clmnpsz": it is upper-cased and
+!      its character is passed to <category-range-class>;
+!   2. a name whose >title form is a member of `categories`: it is
+!      looked up in simple-category-table and the result is passed to
+!      <category-class>;
+!   3. a name starting with "script=": the remainder is looked up in
+!      simple-script-table and the result is passed to <script-class>;
+!      if the lookup fails, bad-class is thrown with the "script="
+!      prefix put back on the name.
+! Any other name throws bad-class.
+: parse-unicode-class ( name -- class )
{
- { "Lower" letter-class }
- { "Upper" LETTER-class }
- { "Alpha" Letter-class }
- { "ASCII" ascii-class }
- { "Digit" digit-class }
- { "Alnum" alpha-class }
- { "Punct" punctuation-class }
- { "Graph" java-printable-class }
- { "Print" java-printable-class }
- { "Blank" non-newline-blank-class }
- { "Cntrl" control-character-class }
- { "XDigit" hex-digit-class }
- { "Space" java-blank-class }
- ! TODO: unicode-character-class
- } [ bad-class ] at-error ;
+ { [ dup { [ length 1 = ] [ first "clmnpsz" member? ] } 1&& ] [
+ >upper first
+ <category-range-class>
+ ] }
+ { [ dup >title categories member? ] [
+ simple-category-table at <category-class>
+ ] }
+ { [ "script=" ?head ] [
+ [ simple-script-table at ]
+ [ <script-class> ]
+ [ "script=" prepend bad-class ] ?if
+ ] }
+ [ bad-class ]
+ } cond ;
+
+! Resolve name with parse-unicode-class; if that yields f, throw
+! bad-class with the original name instead.
+: unicode-class ( name -- class )
+ [ parse-unicode-class ] [ bad-class ] ?unless ;
+
+! Convert a character-class name to a class. The name is converted to
+! a string and normalized with `simple`, so the keys in the table below
+! are written in lower case. Names that are not in the table are passed
+! on to unicode-class (through at-error).
+: name>class ( name -- class )
+ >string simple {
+ { "lower" letter-class }
+ { "upper" LETTER-class }
+ { "alpha" Letter-class }
+ { "ascii" ascii-class }
+ { "digit" digit-class }
+ { "alnum" alpha-class }
+ { "punct" punctuation-class }
+ { "graph" java-printable-class }
+ { "blank" non-newline-blank-class }
+ { "cntrl" control-character-class }
+ { "xdigit" hex-digit-class }
+ { "space" java-blank-class }
+ } [ unicode-class ] at-error ;
+! Map an escape character to the AST node it stands for. Entries either
+! produce a literal control character (for example n, r, t), a
+! character class wrapped in <primitive-class> (w, W), or a
+! <tagged-epsilon> node (z, Z, A, b, B). A character with no entry is
+! returned unchanged by the empty default quotation.
+! NOTE(review): presumably called with the character that follows a
+! backslash in the pattern -- confirm against the grammar action.
: lookup-escape ( char -- ast )
{
- { CHAR: t [ CHAR: \t ] }
+ { CHAR: a [ CHAR: \a ] }
+ { CHAR: e [ CHAR: \e ] }
+ { CHAR: f [ CHAR: \f ] }
{ CHAR: n [ CHAR: \n ] }
{ CHAR: r [ CHAR: \r ] }
- { CHAR: f [ HEX: c ] }
- { CHAR: a [ HEX: 7 ] }
- { CHAR: e [ HEX: 1b ] }
- { CHAR: \\ [ CHAR: \\ ] }
+ { CHAR: t [ CHAR: \t ] }
+ { CHAR: v [ CHAR: \v ] }
+ { CHAR: 0 [ CHAR: \0 ] }
{ CHAR: w [ c-identifier-class <primitive-class> ] }
{ CHAR: W [ c-identifier-class <primitive-class> <not-class> ] }
{ CHAR: z [ end-of-input <tagged-epsilon> ] }
{ CHAR: Z [ end-of-file <tagged-epsilon> ] }
{ CHAR: A [ beginning-of-input <tagged-epsilon> ] }
+ { CHAR: b [ word-break <tagged-epsilon> ] }
+ { CHAR: B [ word-break <not-class> <tagged-epsilon> ] }
[ ]
} case ;
{ CHAR: i case-insensitive }
{ CHAR: d unix-lines }
{ CHAR: m multiline }
- { CHAR: n multiline }
{ CHAR: r reversed-regexp }
{ CHAR: s dotall }
- { CHAR: u unicode-case }
- { CHAR: x comments }
} ;
+! Error thrown, carrying the offending value, when an option character
+! has no entry in options-assoc.
+ERROR: nonexistent-option name ;
+
+! Look up the option singleton for a character in options-assoc;
+! throws nonexistent-option with that character when there is no entry.
: ch>option ( ch -- singleton )
- options-assoc at ;
+ [ options-assoc at ] [ nonexistent-option ] ?unless ;
+! Reverse lookup in options-assoc: return the key whose value is the
+! given option.
+! NOTE(review): the stack effect names the result "string", but the
+! keys of options-assoc appear to be characters -- confirm.
: option>ch ( option -- string )
options-assoc value-at ;
+! Split the string at its first "-" and pass the two pieces to
+! parse-options.
+! NOTE(review): presumably the part before "-" lists options to turn
+! on and the part after it lists options to turn off -- confirm
+! against parse-options.
: string>options ( string -- options )
"-" split1 parse-options ;
-
+
: options>string ( options -- string )
[ on>> ] [ off>> ] bi
[ [ option>ch ] map ] bi@
! add greedy and nongreedy forms of matching
! (once it's all implemented)
+! PEG grammar for regular-expression syntax; the rule actions build
+! the AST nodes.
+!   * Main matches an Alternation followed by End, and End succeeds only
+!     when no input remains.
+!   * Escape handles \p{...} and \P{...} through name>class, \Q...\E
+!     quoting, and \u followed by four characters read as hexadecimal.
+!   * CharClass combines bracketed classes with the operators && || ~~
+!     and --. AnyRangeCharacter refuses to match at one of those
+!     operator pairs, and Range/StartRange refuse a second "-" after the
+!     range dash, so operators are not consumed as class members.
+!   * The "?=", "?!", "?<=" and "?<!" forms build lookahead and
+!     lookbehind nodes; the negative forms wrap the node in <not-class>.
-EBNF: parse-regexp
+EBNF: parse-regexp [=[
CharacterInBracket = !("}") Character
QuotedCharacter = !("\\E") .
-Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
- | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
+Escape = "p{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> ]]
+ | "P{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> <not-class> ]]
| "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
| "u" Character:a Character:b Character:c Character:d
=> [[ { a b c d } hex> ensure-number ]]
EscapeSequence = "\\" Escape:e => [[ e ]]
Character = EscapeSequence
- | "$" => [[ $ <tagged-epsilon> ]]
- | "^" => [[ ^ <tagged-epsilon> ]]
+ | "$" => [[ $crlf <tagged-epsilon> ]]
+ | "^" => [[ ^crlf <tagged-epsilon> ]]
| . ?[ allowed-char? ]?
-AnyRangeCharacter = EscapeSequence | .
+AnyRangeCharacter = !("&&"|"||"|"--"|"~~") (EscapeSequence | .)
RangeCharacter = !("]") AnyRangeCharacter
-Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+Range = RangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
| RangeCharacter
-StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+StartRange = AnyRangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
| AnyRangeCharacter
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
-CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
+BasicCharClass = "^"?:n Ranges:e => [[ e n char-class ]]
+
+CharClass = BasicCharClass:b "&&" CharClass:c
+ => [[ b c 2array <and-class> ]]
+ | BasicCharClass:b "||" CharClass:c
+ => [[ b c 2array <or-class> ]]
+ | BasicCharClass:b "~~" CharClass:c
+ => [[ b c <sym-diff-class> ]]
+ | BasicCharClass:b "--" CharClass:c
+ => [[ b c <minus-class> ]]
+ | BasicCharClass
Options = [idmsux]*
=> [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]]
- | "?=" Alternation:a => [[ a t <lookahead> <tagged-epsilon> ]]
- | "?!" Alternation:a => [[ a f <lookahead> <tagged-epsilon> ]]
- | "?<=" Alternation:a => [[ a t <lookbehind> <tagged-epsilon> ]]
- | "?<!" Alternation:a => [[ a f <lookbehind> <tagged-epsilon> ]]
+ | "?=" Alternation:a => [[ a <lookahead> <tagged-epsilon> ]]
+ | "?!" Alternation:a => [[ a <lookahead> <not-class> <tagged-epsilon> ]]
+ | "?<=" Alternation:a => [[ a <lookbehind> <tagged-epsilon> ]]
+ | "?<!" Alternation:a => [[ a <lookbehind> <not-class> <tagged-epsilon> ]]
| Alternation
Element = "(" Parenthized:p ")" => [[ p ]]
| "[" CharClass:r "]" => [[ r ]]
- | ".":d => [[ any-char <primitive-class> ]]
+ | ".":d => [[ dot ]]
| Character
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
End = !(.)
Main = Alternation End
-;EBNF
+]=]