! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: kernel arrays accessors fry sequences regexp.classes ;
-FROM: math.ranges => [a,b] ;
+USING: kernel arrays accessors fry sequences regexp.classes
+math.ranges math ;
IN: regexp.ast
TUPLE: negation term ;
<array> <concatenation> ;
GENERIC: <times> ( term times -- term' )
+
M: at-least <times>
n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
+
+: to-times ( term n -- ast )
+ dup zero?
+ [ 2drop epsilon ]
+ [ dupd 1- to-times 2array <concatenation> <maybe> ]
+ if ;
+
M: from-to <times>
- [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
+ [ n>> swap repetition ]
+ [ [ m>> ] [ n>> ] bi - to-times ] 2bi
+ 2array <concatenation> ;
: char-class ( ranges ? -- term )
[ <or-class> ] dip [ <not-class> ] when ;
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words combinators locals
ascii unicode.categories combinators.short-circuit sequences
-fry macros arrays assocs sets classes mirrors ;
+fry macros arrays assocs sets classes mirrors unicode.script
+unicode.data ;
IN: regexp.classes
-SINGLETONS: any-char any-char-no-nl
-letter-class LETTER-class Letter-class digit-class
+SINGLETONS: dot letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ;
-SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ;
+SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file
+^unix $unix word-break ;
-TUPLE: range from to ;
-C: <range> range
+TUPLE: range-class from to ;
+C: <range-class> range-class
+
+TUPLE: primitive-class class ;
+C: <primitive-class> primitive-class
+
+TUPLE: category-class category ;
+C: <category-class> category-class
+
+TUPLE: category-range-class category ;
+C: <category-range-class> category-range-class
+
+TUPLE: script-class script ;
+C: <script-class> script-class
GENERIC: class-member? ( obj class -- ? )
M: integer class-member? ( obj class -- ? ) = ;
-M: range class-member? ( obj class -- ? )
+M: range-class class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ;
-M: any-char class-member? ( obj class -- ? )
- 2drop t ;
-
-M: any-char-no-nl class-member? ( obj class -- ? )
- drop CHAR: \n = not ;
-
M: letter-class class-member? ( obj class -- ? )
drop letter? ;
M: terminator-class class-member? ( obj class -- ? )
drop "\r\n\u000085\u002029\u002028" member? ;
-M: ^ class-member? ( obj class -- ? )
- 2drop f ;
+M: f class-member? 2drop f ;
-M: $ class-member? ( obj class -- ? )
- 2drop f ;
+M: script-class class-member?
+ [ script-of ] [ script>> ] bi* = ;
-M: f class-member? 2drop f ;
+M: category-class class-member?
+ [ category# ] [ category>> ] bi* = ;
-TUPLE: primitive-class class ;
-C: <primitive-class> primitive-class
+M: category-range-class class-member?
+ [ category first ] [ category>> ] bi* = ;
TUPLE: not-class class ;
PREDICATE: not-integer < not-class class>> integer? ;
-PREDICATE: not-primitive < not-class class>> primitive-class? ;
+
+UNION: simple-class
+ primitive-class range-class category-class category-range-class dot ;
+PREDICATE: not-simple < not-class class>> simple-class? ;
M: not-class class-member?
class>> class-member? not ;
[ drop class new seq { } like >>seq ]
} case ; inline
-TUPLE: class-partition integers not-integers primitives not-primitives and or other ;
+TUPLE: class-partition integers not-integers simples not-simples and or other ;
: partition-classes ( seq -- class-partition )
prune
[ integer? ] partition
[ not-integer? ] partition
- [ primitive-class? ] partition ! extend primitive-class to epsilon tags
- [ not-primitive? ] partition
+ [ simple-class? ] partition
+ [ not-simple? ] partition
[ and-class? ] partition
[ or-class? ] partition
class-partition boa ;
: filter-not-integers ( partition -- partition' )
dup
- [ primitives>> ] [ not-primitives>> ] [ or>> ] tri
+ [ simples>> ] [ not-simples>> ] [ or>> ] tri
3append and-class boa
'[ [ class>> _ class-member? ] filter ] change-not-integers ;
: answer-ors ( partition -- partition' )
- dup [ not-integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append
+ dup [ not-integers>> ] [ not-simples>> ] [ simples>> ] tri 3append
'[ [ _ [ t substitute ] each ] map ] change-or ;
: contradiction? ( partition -- ? )
{
- [ [ primitives>> ] [ not-primitives>> ] bi intersects? ]
+ [ [ simples>> ] [ not-simples>> ] bi intersects? ]
[ other>> f swap member? ]
} 1|| ;
: filter-integers ( partition -- partition' )
dup
- [ primitives>> ] [ not-primitives>> ] [ and>> ] tri
+ [ simples>> ] [ not-simples>> ] [ and>> ] tri
3append or-class boa
'[ [ _ class-member? not ] filter ] change-integers ;
: answer-ands ( partition -- partition' )
- dup [ integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append
+ dup [ integers>> ] [ not-simples>> ] [ simples>> ] tri 3append
'[ [ _ [ f substitute ] each ] map ] change-and ;
: tautology? ( partition -- ? )
{
- [ [ primitives>> ] [ not-primitives>> ] bi intersects? ]
+ [ [ simples>> ] [ not-simples>> ] bi intersects? ]
[ other>> t swap member? ]
} 1|| ;
M: primitive-class class-member?
class>> class-member? ;
-UNION: class primitive-class not-class or-class and-class range ;
-
TUPLE: condition question yes no ;
C: <condition> condition
PRIVATE>
-CONSTANT: <nothing> R/ (?~.*)/
+CONSTANT: <nothing> R/ (?~.*)/s
: <literal> ( string -- regexp )
[ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable
: <char-range> ( char1 char2 -- regexp )
[ [ "[" "-" surround ] [ "]" append ] bi* append ]
- [ <range> ]
+ [ <range-class> ]
2bi make-regexp ;
: <or> ( regexps -- disjunction )
sequences fry quotations math.order math.ranges vectors
unicode.categories regexp.transition-tables words sets hashtables
combinators.short-circuit unicode.case unicode.case.private regexp.ast
-regexp.classes ;
+regexp.classes memoize ;
IN: regexp.nfa
! This uses unicode.case.private for ch>upper and ch>lower
M: not-class modify-class
class>> modify-class <not-class> ;
-M: any-char modify-class
- drop dotall option? t any-char-no-nl ? ;
+MEMO: unix-dot ( -- class )
+ CHAR: \n <not-class> ;
+
+MEMO: nonl-dot ( -- class )
+ { CHAR: \n CHAR: \r } <or-class> <not-class> ;
+
+M: dot modify-class
+ drop dotall option? [ t ] [
+ unix-lines option?
+ unix-dot nonl-dot ?
+ ] if ;
: modify-letter-class ( class -- newclass )
case-insensitive option? [ drop Letter-class ] when ;
[ [ LETTER? ] bi@ and ]
} 2|| ;
-M: range modify-class
+M: range-class modify-class
case-insensitive option? [
dup cased-range? [
[ from>> ] [ to>> ] bi
- [ [ ch>lower ] bi@ <range> ]
- [ [ ch>upper ] bi@ <range> ] 2bi
+ [ [ ch>lower ] bi@ <range-class> ]
+ [ [ ch>upper ] bi@ <range-class> ] 2bi
2array <or-class>
] when
] when ;
-M: class nfa-node
+M: object nfa-node
modify-class add-simple-entry ;
M: with-options nfa-node ( node -- start end )
ERROR: bad-class name ;
+: parse-unicode-class ( name -- class )
+ ! Implement this!
+ drop f ;
+
+: unicode-class ( name -- class )
+ dup parse-unicode-class [ ] [ bad-class ] ?if ;
+
: name>class ( name -- class )
>string >case-fold {
{ "lower" letter-class }
{ "cntrl" control-character-class }
{ "xdigit" hex-digit-class }
{ "space" java-blank-class }
- ! TODO: unicode-character-class
- } [ bad-class ] at-error ;
+ } [ unicode-class ] at-error ;
: lookup-escape ( char -- ast )
{
RangeCharacter = !("]") AnyRangeCharacter
-Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
| RangeCharacter
-StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
| AnyRangeCharacter
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
Element = "(" Parenthized:p ")" => [[ p ]]
| "[" CharClass:r "]" => [[ r ]]
- | ".":d => [[ any-char <primitive-class> ]]
+ | ".":d => [[ dot ]]
| Character
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
-! Copyright (C) 2008 Daniel Ehrenberg.
+! Copyright (C) 2008, 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: combinators.short-circuit assocs math kernel sequences
io.files hashtables quotations splitting grouping arrays io
: char>name ( char -- name ) name-map value-at ;
: property? ( char property -- ? ) properties at interval-key? ;
+! For non-existent characters, use Cn
+CONSTANT: categories
+ { "Cn"
+ "Lu" "Ll" "Lt" "Lm" "Lo"
+ "Mn" "Mc" "Me"
+ "Nd" "Nl" "No"
+ "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
+ "Sm" "Sc" "Sk" "So"
+ "Zs" "Zl" "Zp"
+ "Cc" "Cf" "Cs" "Co" }
+
+MEMO: categories-map ( -- hashtable )
+ categories <enum> [ swap ] H{ } assoc-map-as ;
+
+CONSTANT: num-chars HEX: 2FA1E
+
+: category# ( char -- category )
+ ! There are a few characters that should be Cn
+ ! that this gives Cf or Mn
+ ! Cf = 26; Mn = 5; Cn = 29
+ ! Use a compressed array instead?
+ dup category-map ?nth [ ] [
+ dup HEX: E0001 HEX: E007F between?
+ [ drop 26 ] [
+ HEX: E0100 HEX: E01EF between? 5 29 ?
+ ] if
+ ] ?if ;
+
+: category ( char -- category )
+ category# categories nth ;
+
! Loading data from UnicodeData.txt
: split-; ( line -- array )
[ nip zero? not ] assoc-filter
>hashtable ;
-! For non-existent characters, use Cn
-CONSTANT: categories
- { "Cn"
- "Lu" "Ll" "Lt" "Lm" "Lo"
- "Mn" "Mc" "Me"
- "Nd" "Nl" "No"
- "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
- "Sm" "Sc" "Sk" "So"
- "Zs" "Zl" "Zp"
- "Cc" "Cf" "Cs" "Co" }
-
-MEMO: categories-map ( -- hashtable )
- categories <enum> [ swap ] H{ } assoc-map-as ;
-
-CONSTANT: num-chars HEX: 2FA1E
-
! the maximum unicode char in the first 3 planes
: ?set-nth ( val index seq -- )
load-properties to: properties
-! Utility to load resource files that look like Scripts.txt
-
-SYMBOL: interned
-
-: parse-script ( filename -- assoc )
- ! assoc is code point/range => name
- ascii file-lines filter-comments [ split-; ] map ;
-
-: range, ( value key -- )
- swap interned get
- [ = ] with find nip 2array , ;
-
-: expand-ranges ( assoc -- interval-map )
- [
- [
- swap CHAR: . over member? [
- ".." split1 [ hex> ] bi@ 2array
- ] [ hex> ] if range,
- ] assoc-each
- ] { } make <interval-map> ;
-
-: process-script ( ranges -- table )
- dup values prune interned
- [ expand-ranges ] with-variable ;
-
-: load-script ( filename -- table )
- parse-script process-script ;
-
[ name>char [ "Invalid character" throw ] unless* ]
name>char-hook set-global
unicode.data ;
IN: unicode.script
+<PRIVATE
+
+SYMBOL: interned
+
+: parse-script ( filename -- assoc )
+ ! assoc is code point/range => name
+ ascii file-lines filter-comments [ split-; ] map ;
+
+: range, ( value key -- )
+ swap interned get
+ [ = ] with find nip 2array , ;
+
+: expand-ranges ( assoc -- interval-map )
+ [
+ [
+ swap CHAR: . over member? [
+ ".." split1 [ hex> ] bi@ 2array
+ ] [ hex> ] if range,
+ ] assoc-each
+ ] { } make <interval-map> ;
+
+: process-script ( ranges -- table )
+ dup values prune interned
+ [ expand-ranges ] with-variable ;
+
+: load-script ( filename -- table )
+ parse-script process-script ;
+
VALUE: script-table
"vocab:unicode/script/Scripts.txt" load-script
to: script-table
+PRIVATE>
+
: script-of ( char -- script )
script-table interval-at ;
assocs classes.predicate math.order strings.parser ;
IN: unicode.syntax
-! Character classes (categories)
-
-: category# ( char -- category )
- ! There are a few characters that should be Cn
- ! that this gives Cf or Mn
- ! Cf = 26; Mn = 5; Cn = 29
- ! Use a compressed array instead?
- dup category-map ?nth [ ] [
- dup HEX: E0001 HEX: E007F between?
- [ drop 26 ] [
- HEX: E0100 HEX: E01EF between? 5 29 ?
- ] if
- ] ?if ;
-
-: category ( char -- category )
- category# categories nth ;
+<PRIVATE
: >category-array ( categories -- bitarray )
categories [ swap member? ] with map >bit-array ;
: define-category ( word categories -- )
[category] integer swap define-predicate-class ;
+PRIVATE>
+
: CATEGORY:
CREATE ";" parse-tokens define-category ; parsing