! See http://factorcode.org/license.txt for BSD license.
USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
combinators regexp.classes strings splitting peg locals accessors
-regexp.ast unicode.case ;
+regexp.ast unicode.case unicode.script.private unicode.categories
+memoize interval-maps sets unicode.data combinators.short-circuit ;
IN: regexp.parser
: allowed-char? ( ch -- ? )
ERROR: bad-class name ;
+: simple ( str -- simple )
+ ! Alternatively, first collation key level?
+ >case-fold [ " \t_" member? not ] filter ;
+
+: simple-table ( seq -- table )
+ [ [ simple ] keep ] H{ } map>assoc ;
+
+MEMO: simple-script-table ( -- table )
+ script-table interval-values members simple-table ;
+
+MEMO: simple-category-table ( -- table )
+ categories simple-table ;
+
: parse-unicode-class ( name -- class )
- ! Implement this!
- drop f ;
+ {
+ { [ dup { [ length 1 = ] [ first "clmnpsz" member? ] } 1&& ] [
+ >upper first
+ <category-range-class>
+ ] }
+ { [ dup >title categories member? ] [
+ simple-category-table at <category-class>
+ ] }
+ { [ "script=" ?head ] [
+ dup simple-script-table at
+ [ <script-class> ]
+ [ "script=" prepend bad-class ] ?if
+ ] }
+ [ bad-class ]
+ } cond ;
: unicode-class ( name -- class )
dup parse-unicode-class [ ] [ bad-class ] ?if ;
: name>class ( name -- class )
- >string >case-fold {
+ >string simple {
{ "lower" letter-class }
{ "upper" LETTER-class }
{ "alpha" Letter-class }
{ CHAR: t [ CHAR: \t ] }
{ CHAR: n [ CHAR: \n ] }
{ CHAR: r [ CHAR: \r ] }
- { CHAR: f [ HEX: c ] }
- { CHAR: a [ HEX: 7 ] }
- { CHAR: e [ HEX: 1b ] }
+ { CHAR: f [ 0xc ] }
+ { CHAR: a [ 0x7 ] }
+ { CHAR: e [ 0x1b ] }
{ CHAR: \\ [ CHAR: \\ ] }
{ CHAR: w [ c-identifier-class <primitive-class> ] }
{ CHAR: s dotall }
} ;
+ERROR: nonexistent-option name ;
+
: ch>option ( ch -- singleton )
- options-assoc at ;
+ dup options-assoc at [ ] [ nonexistent-option ] ?if ;
: option>ch ( option -- string )
options-assoc value-at ;
QuotedCharacter = !("\\E") .
Escape = "p{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> ]]
- | "P{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> <negation> ]]
+ | "P{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> <not-class> ]]
| "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
| "u" Character:a Character:b Character:c Character:d
=> [[ { a b c d } hex> ensure-number ]]
| "^" => [[ ^ <tagged-epsilon> ]]
| . ?[ allowed-char? ]?
-AnyRangeCharacter = EscapeSequence | .
+AnyRangeCharacter = !("&&"|"||"|"--"|"~~") (EscapeSequence | .)
RangeCharacter = !("]") AnyRangeCharacter
-Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
+Range = RangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
| RangeCharacter
-StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
+StartRange = AnyRangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
| AnyRangeCharacter
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
-CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
+BasicCharClass = "^"?:n Ranges:e => [[ e n char-class ]]
+
+CharClass = BasicCharClass:b "&&" CharClass:c
+ => [[ b c 2array <and-class> ]]
+ | BasicCharClass:b "||" CharClass:c
+ => [[ b c 2array <or-class> ]]
+ | BasicCharClass:b "~~" CharClass:c
+ => [[ b c <sym-diff-class> ]]
+ | BasicCharClass:b "--" CharClass:c
+ => [[ b c <minus-class> ]]
+ | BasicCharClass
Options = [idmsux]*