1 ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
4 combinators regexp.classes strings splitting peg locals accessors
5 regexp.ast unicode.case unicode.script.private unicode.categories
6 memoize interval-maps sets unicode.data combinators.short-circuit ;
9 : allowed-char? ( ch -- ? )
10 ".()|[*+?$^" member? not ;
14 : ensure-number ( n -- n )
15 [ bad-number ] unless* ;
17 :: at-error ( key assoc quot: ( key -- replacement ) -- value )
18 key assoc at* [ drop key quot call ] unless ; inline
20 ERROR: bad-class name ;
22 : simple ( str -- simple )
23 ! Alternatively, first collation key level?
24 >case-fold [ " \t_" member? not ] filter ;
26 : simple-table ( seq -- table )
27 [ [ simple ] keep ] H{ } map>assoc ;
29 MEMO: simple-script-table ( -- table )
30 script-table interval-values prune simple-table ;
32 MEMO: simple-category-table ( -- table )
33 categories simple-table ;
35 : parse-unicode-class ( name -- class )
37 { [ dup { [ length 1 = ] [ first "clmnpsz" member? ] } 1&& ] [
39 <category-range-class>
41 { [ dup >title categories member? ] [
42 simple-category-table at <category-class>
44 { [ "script=" ?head ] [
45 dup simple-script-table at
47 [ "script=" prepend bad-class ] ?if
52 : unicode-class ( name -- class )
53 dup parse-unicode-class [ ] [ bad-class ] ?if ;
55 : name>class ( name -- class )
57 { "lower" letter-class }
58 { "upper" LETTER-class }
59 { "alpha" Letter-class }
60 { "ascii" ascii-class }
61 { "digit" digit-class }
62 { "alnum" alpha-class }
63 { "punct" punctuation-class }
64 { "graph" java-printable-class }
65 { "blank" non-newline-blank-class }
66 { "cntrl" control-character-class }
67 { "xdigit" hex-digit-class }
68 { "space" java-blank-class }
69 } [ unicode-class ] at-error ;
71 : lookup-escape ( char -- ast )
73 { CHAR: t [ CHAR: \t ] }
74 { CHAR: n [ CHAR: \n ] }
75 { CHAR: r [ CHAR: \r ] }
76 { CHAR: f [ HEX: c ] }
77 { CHAR: a [ HEX: 7 ] }
78 { CHAR: e [ HEX: 1b ] }
79 { CHAR: \\ [ CHAR: \\ ] }
81 { CHAR: w [ c-identifier-class <primitive-class> ] }
82 { CHAR: W [ c-identifier-class <primitive-class> <not-class> ] }
83 { CHAR: s [ java-blank-class <primitive-class> ] }
84 { CHAR: S [ java-blank-class <primitive-class> <not-class> ] }
85 { CHAR: d [ digit-class <primitive-class> ] }
86 { CHAR: D [ digit-class <primitive-class> <not-class> ] }
88 { CHAR: z [ end-of-input <tagged-epsilon> ] }
89 { CHAR: Z [ end-of-file <tagged-epsilon> ] }
90 { CHAR: A [ beginning-of-input <tagged-epsilon> ] }
91 { CHAR: b [ word-break <tagged-epsilon> ] }
92 { CHAR: B [ word-break <not-class> <tagged-epsilon> ] }
96 : options-assoc ( -- assoc )
98 { CHAR: i case-insensitive }
99 { CHAR: d unix-lines }
100 { CHAR: m multiline }
101 { CHAR: r reversed-regexp }
105 ERROR: nonexistent-option name ;
107 : ch>option ( ch -- singleton )
108 dup options-assoc at [ ] [ nonexistent-option ] ?if ;
110 : option>ch ( option -- string )
111 options-assoc value-at ;
113 : parse-options ( on off -- options )
114 [ [ ch>option ] { } map-as ] bi@ <options> ;
116 : string>options ( string -- options )
117 "-" split1 parse-options ;
119 : options>string ( options -- string )
120 [ on>> ] [ off>> ] bi
121 [ [ option>ch ] map ] bi@
122 [ "-" glue ] unless-empty
125 ! TODO: add syntax for various parenthized things,
126 ! add greedy and nongreedy forms of matching
127 ! (once it's all implemented)
131 CharacterInBracket = !("}") Character
133 QuotedCharacter = !("\\E") .
135 Escape = "p{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> ]]
136 | "P{" CharacterInBracket*:s "}" => [[ s name>class <primitive-class> <negation> ]]
137 | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
138 | "u" Character:a Character:b Character:c Character:d
139 => [[ { a b c d } hex> ensure-number ]]
140 | "x" Character:a Character:b
141 => [[ { a b } hex> ensure-number ]]
142 | "0" Character:a Character:b Character:c
143 => [[ { a b c } oct> ensure-number ]]
144 | . => [[ lookup-escape ]]
146 EscapeSequence = "\\" Escape:e => [[ e ]]
148 Character = EscapeSequence
149 | "$" => [[ $ <tagged-epsilon> ]]
150 | "^" => [[ ^ <tagged-epsilon> ]]
151 | . ?[ allowed-char? ]?
153 AnyRangeCharacter = !("&&"|"||"|"--"|"~~") (EscapeSequence | .)
155 RangeCharacter = !("]") AnyRangeCharacter
157 Range = RangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
160 StartRange = AnyRangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
163 Ranges = StartRange:s Range*:r => [[ r s prefix ]]
165 BasicCharClass = "^"?:n Ranges:e => [[ e n char-class ]]
167 CharClass = BasicCharClass:b "&&" CharClass:c
168 => [[ b c 2array <and-class> ]]
169 | BasicCharClass:b "||" CharClass:c
170 => [[ b c 2array <or-class> ]]
171 | BasicCharClass:b "~~" CharClass:c
172 => [[ b c <sym-diff-class> ]]
173 | BasicCharClass:b "--" CharClass:c
174 => [[ b c <minus-class> ]]
179 Parenthized = "?:" Alternation:a => [[ a ]]
180 | "?" Options:on "-"? Options:off ":" Alternation:a
181 => [[ a on off parse-options <with-options> ]]
182 | "?#" [^)]* => [[ f ]]
183 | "?~" Alternation:a => [[ a <negation> ]]
184 | "?=" Alternation:a => [[ a <lookahead> <tagged-epsilon> ]]
185 | "?!" Alternation:a => [[ a <lookahead> <not-class> <tagged-epsilon> ]]
186 | "?<=" Alternation:a => [[ a <lookbehind> <tagged-epsilon> ]]
187 | "?<!" Alternation:a => [[ a <lookbehind> <not-class> <tagged-epsilon> ]]
190 Element = "(" Parenthized:p ")" => [[ p ]]
191 | "[" CharClass:r "]" => [[ r ]]
195 Number = (!(","|"}").)* => [[ string>number ensure-number ]]
197 Times = "," Number:n "}" => [[ 0 n <from-to> ]]
198 | Number:n ",}" => [[ n <at-least> ]]
199 | Number:n "}" => [[ n n <from-to> ]]
200 | "}" => [[ bad-number ]]
201 | Number:n "," Number:m "}" => [[ n m <from-to> ]]
203 Repeated = Element:e "{" Times:t => [[ e t <times> ]]
204 | Element:e "??" => [[ e <maybe> ]]
205 | Element:e "*?" => [[ e <star> ]]
206 | Element:e "+?" => [[ e <plus> ]]
207 | Element:e "?" => [[ e <maybe> ]]
208 | Element:e "*" => [[ e <star> ]]
209 | Element:e "+" => [[ e <plus> ]]
212 Concatenation = Repeated*:r => [[ r sift <concatenation> ]]
214 Alternation = Concatenation:c ("|" Concatenation)*:a
215 => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]
219 Main = Alternation End