fry macros arrays assocs sets classes mirrors ;
IN: regexp.classes
-SINGLETONS: any-char any-char-no-nl
-letter-class LETTER-class Letter-class digit-class
+SINGLETONS: dot letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ;
-SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ;
+SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file
+^unix $unix word-break ;
TUPLE: range from to ;
C: <range> range
+TUPLE: primitive-class class ;
+C: <primitive-class> primitive-class
+
+TUPLE: category-class category ;
+C: <category-class> category-class
+
+TUPLE: category-range-class category ;
+C: <category-range-class> category-range-class
+
+TUPLE: script-class script ;
+C: <script-class> script-class
+
GENERIC: class-member? ( obj class -- ? )
M: t class-member? ( obj class -- ? ) 2drop t ;
M: range class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ;
-M: any-char class-member? ( obj class -- ? )
- 2drop t ;
-
-M: any-char-no-nl class-member? ( obj class -- ? )
- drop CHAR: \n = not ;
-
M: letter-class class-member? ( obj class -- ? )
drop letter? ;
M: terminator-class class-member? ( obj class -- ? )
drop "\r\n\u000085\u002029\u002028" member? ;
-M: ^ class-member? ( obj class -- ? )
- 2drop f ;
+M: f class-member? 2drop f ;
-M: $ class-member? ( obj class -- ? )
- 2drop f ;
+M: script-class class-member?
+ [ script-of ] [ script>> ] bi* = ;
-M: f class-member? 2drop f ;
+M: category-class class-member?
+ [ category# ] [ category>> ] bi* = ;
-TUPLE: primitive-class class ;
-C: <primitive-class> primitive-class
+M: category-range-class class-member?
+ [ category first ] [ category>> ] bi* = ;
TUPLE: not-class class ;
M: not-class modify-class
class>> modify-class <not-class> ;
-M: any-char modify-class
- drop dotall option? t any-char-no-nl ? ;
+MEMO: unix-dot ( -- class )
+ CHAR: \n <not-class> ;
+
+MEMO: nonl-dot ( -- class )
+ { CHAR: \n CHAR: \r } <or-class> <not-class> ;
+
+M: dot modify-class
+ drop dotall option? [ t ] [
+ unix-lines option?
+ unix-dot nonl-dot ?
+ ] if ;
: modify-letter-class ( class -- newclass )
case-insensitive option? [ drop Letter-class ] when ;
ERROR: bad-class name ;
+: parse-unicode-class ( name -- class )
+ ! Implement this!
+ drop f ;
+
+: unicode-class ( name -- class )
+ parse-unicode-class [ bad-class ] unless* ;
+
: name>class ( name -- class )
>string >case-fold {
{ "lower" letter-class }
{ "cntrl" control-character-class }
{ "xdigit" hex-digit-class }
{ "space" java-blank-class }
- ! TODO: unicode-character-class
- } [ bad-class ] at-error ;
+ } [ unicode-class ] at-error ;
: lookup-escape ( char -- ast )
{
Element = "(" Parenthized:p ")" => [[ p ]]
| "[" CharClass:r "]" => [[ r ]]
- | ".":d => [[ any-char <primitive-class> ]]
+ | ".":d => [[ dot ]]
| Character
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
-! Copyright (C) 2008 Daniel Ehrenberg.
+! Copyright (C) 2008, 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: combinators.short-circuit assocs math kernel sequences
io.files hashtables quotations splitting grouping arrays io
: char>name ( char -- name ) name-map value-at ;
: property? ( char property -- ? ) properties at interval-key? ;
+: category# ( char -- category )
+ ! There are a few characters that should be Cn
+ ! that this gives Cf or Mn
+ ! Cf = 26; Mn = 5; Cn = 29
+ ! Use a compressed array instead?
+ dup category-map ?nth [ ] [
+ dup HEX: E0001 HEX: E007F between?
+ [ drop 26 ] [
+ HEX: E0100 HEX: E01EF between? 5 29 ?
+ ] if
+ ] ?if ;
+
+: category ( char -- category )
+ category# categories nth ;
+
! Loading data from UnicodeData.txt
: split-; ( line -- array )
load-properties to: properties
-! Utility to load resource files that look like Scripts.txt
-
-SYMBOL: interned
-
-: parse-script ( filename -- assoc )
- ! assoc is code point/range => name
- ascii file-lines filter-comments [ split-; ] map ;
-
-: range, ( value key -- )
- swap interned get
- [ = ] with find nip 2array , ;
-
-: expand-ranges ( assoc -- interval-map )
- [
- [
- swap CHAR: . over member? [
- ".." split1 [ hex> ] bi@ 2array
- ] [ hex> ] if range,
- ] assoc-each
- ] { } make <interval-map> ;
-
-: process-script ( ranges -- table )
- dup values prune interned
- [ expand-ranges ] with-variable ;
-
-: load-script ( filename -- table )
- parse-script process-script ;
-
[ name>char [ "Invalid character" throw ] unless* ]
name>char-hook set-global
unicode.data ;
IN: unicode.script
+<PRIVATE
+
+SYMBOL: interned
+
+: parse-script ( filename -- assoc )
+ ! assoc is code point/range => name
+ ascii file-lines filter-comments [ split-; ] map ;
+
+: range, ( value key -- )
+ swap interned get
+ [ = ] with find nip 2array , ;
+
+: expand-ranges ( assoc -- interval-map )
+ [
+ [
+ swap CHAR: . over member? [
+ ".." split1 [ hex> ] bi@ 2array
+ ] [ hex> ] if range,
+ ] assoc-each
+ ] { } make <interval-map> ;
+
+: process-script ( ranges -- table )
+ dup values prune interned
+ [ expand-ranges ] with-variable ;
+
+: load-script ( filename -- table )
+ parse-script process-script ;
+
VALUE: script-table
"vocab:unicode/script/Scripts.txt" load-script
to: script-table
+PRIVATE>
+
: script-of ( char -- script )
script-table interval-at ;
assocs classes.predicate math.order strings.parser ;
IN: unicode.syntax
-! Character classes (categories)
-
-: category# ( char -- category )
- ! There are a few characters that should be Cn
- ! that this gives Cf or Mn
- ! Cf = 26; Mn = 5; Cn = 29
- ! Use a compressed array instead?
- dup category-map ?nth [ ] [
- dup HEX: E0001 HEX: E007F between?
- [ drop 26 ] [
- HEX: E0100 HEX: E01EF between? 5 29 ?
- ] if
- ] ?if ;
-
-: category ( char -- category )
- category# categories nth ;
+<PRIVATE
: >category-array ( categories -- bitarray )
categories [ swap member? ] with map >bit-array ;
: define-category ( word categories -- )
[category] integer swap define-predicate-class ;
+PRIVATE>
+
: CATEGORY:
CREATE ";" parse-tokens define-category ; parsing