Merge branch 'master' of git://factorcode.org/git/factor

author Daniel Ehrenberg <littledan@Macintosh-122.(none)>

Wed, 18 Mar 2009 23:12:18 +0000 (18:12 -0500)

committer Daniel Ehrenberg <littledan@Macintosh-122.(none)>

Wed, 18 Mar 2009 23:12:18 +0000 (18:12 -0500)
author Daniel Ehrenberg <littledan@Macintosh-122.(none)>
Wed, 18 Mar 2009 23:12:18 +0000 (18:12 -0500)
committer Daniel Ehrenberg <littledan@Macintosh-122.(none)>
Wed, 18 Mar 2009 23:12:18 +0000 (18:12 -0500)
diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor

index 1c11ed5c7d58070ba5e51d29d48d2fb605963714..be657227e521a2c522a0adc20ff34bb4e6d6fdc7 100644 (file)
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@@ -1,7 +1,7 @@
  ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel arrays accessors fry sequences regexp.classes ;
-FROM: math.ranges => [a,b] ;
+USING: kernel arrays accessors fry sequences regexp.classes
+math.ranges math ;
  IN: regexp.ast
  
  TUPLE: negation term ;
@@ -49,10 +49,20 @@ SINGLETONS: unix-lines dotall multiline case-insensitive reversed-regexp ;
      <array> <concatenation> ;
  
  GENERIC: <times> ( term times -- term' )
+
  M: at-least <times>
      n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
+
+: to-times ( term n -- ast )
+    dup zero?
+    [ 2drop epsilon ]
+    [ dupd 1- to-times 2array <concatenation> <maybe> ]
+    if ;
+
  M: from-to <times>
-    [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
+    [ n>> swap repetition ]
+    [ [ m>> ] [ n>> ] bi - to-times ] 2bi
+    2array <concatenation> ;
  
  : char-class ( ranges ? -- term )
      [ <or-class> ] dip [ <not-class> ] when ;
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor

index e3a177458591bff0d0b99d4ce6f2ebd75e31afef..28b0ed1563441aa7a410fa04fc50e434c6f685cd 100644 (file)
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -2,20 +2,33 @@
  ! See http://factorcode.org/license.txt for BSD license.
  USING: accessors kernel math math.order words combinators locals
  ascii unicode.categories combinators.short-circuit sequences
-fry macros arrays assocs sets classes mirrors ;
+fry macros arrays assocs sets classes mirrors unicode.script
+unicode.data ;
  IN: regexp.classes
  
-SINGLETONS: any-char any-char-no-nl
-letter-class LETTER-class Letter-class digit-class
+SINGLETONS: dot letter-class LETTER-class Letter-class digit-class
  alpha-class non-newline-blank-class
  ascii-class punctuation-class java-printable-class blank-class
  control-character-class hex-digit-class java-blank-class c-identifier-class
  unmatchable-class terminator-class word-boundary-class ;
  
-SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ;
+SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file
+^unix $unix word-break ;
  
-TUPLE: range from to ;
-C: <range> range
+TUPLE: range-class from to ;
+C: <range-class> range-class
+
+TUPLE: primitive-class class ;
+C: <primitive-class> primitive-class
+
+TUPLE: category-class category ;
+C: <category-class> category-class
+
+TUPLE: category-range-class category ;
+C: <category-range-class> category-range-class
+
+TUPLE: script-class script ;
+C: <script-class> script-class
  
  GENERIC: class-member? ( obj class -- ? )
  
@@ -23,15 +36,9 @@ M: t class-member? ( obj class -- ? ) 2drop t ;
  
  M: integer class-member? ( obj class -- ? ) = ;
  
-M: range class-member? ( obj class -- ? )
+M: range-class class-member? ( obj class -- ? )
      [ from>> ] [ to>> ] bi between? ;
  
-M: any-char class-member? ( obj class -- ? )
-    2drop t ;
-
-M: any-char-no-nl class-member? ( obj class -- ? )
-    drop CHAR: \n = not ;
-
  M: letter-class class-member? ( obj class -- ? )
      drop letter? ;
              
@@ -99,21 +106,24 @@ M: unmatchable-class class-member? ( obj class -- ? )
  M: terminator-class class-member? ( obj class -- ? )
      drop "\r\n\u000085\u002029\u002028" member? ;
  
-M: ^ class-member? ( obj class -- ? )
-    2drop f ;
+M: f class-member? 2drop f ;
  
-M: $ class-member? ( obj class -- ? )
-    2drop f ;
+M: script-class class-member?
+    [ script-of ] [ script>> ] bi* = ;
  
-M: f class-member? 2drop f ;
+M: category-class class-member?
+    [ category# ] [ category>> ] bi* = ;
  
-TUPLE: primitive-class class ;
-C: <primitive-class> primitive-class
+M: category-range-class class-member?
+    [ category first ] [ category>> ] bi* = ;
  
  TUPLE: not-class class ;
  
  PREDICATE: not-integer < not-class class>> integer? ;
-PREDICATE: not-primitive < not-class class>> primitive-class? ;
+
+UNION: simple-class
+    primitive-class range-class category-class category-range-class dot ;
+PREDICATE: not-simple < not-class class>> simple-class? ;
  
  M: not-class class-member?
      class>> class-member? not ;
@@ -140,14 +150,14 @@ DEFER: substitute
          [ drop class new seq { } like >>seq ]
      } case ; inline
  
-TUPLE: class-partition integers not-integers primitives not-primitives and or other ;
+TUPLE: class-partition integers not-integers simples not-simples and or other ;
  
  : partition-classes ( seq -- class-partition )
      prune
      [ integer? ] partition
      [ not-integer? ] partition
-    [ primitive-class? ] partition ! extend primitive-class to epsilon tags
-    [ not-primitive? ] partition
+    [ simple-class? ] partition
+    [ not-simple? ] partition
      [ and-class? ] partition
      [ or-class? ] partition
      class-partition boa ;
@@ -161,17 +171,17 @@ TUPLE: class-partition integers not-integers primitives not-primitives and or ot
  
  : filter-not-integers ( partition -- partition' )
      dup
-    [ primitives>> ] [ not-primitives>> ] [ or>> ] tri
+    [ simples>> ] [ not-simples>> ] [ or>> ] tri
      3append and-class boa
      '[ [ class>> _ class-member? ] filter ] change-not-integers ;
  
  : answer-ors ( partition -- partition' )
-    dup [ not-integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append
+    dup [ not-integers>> ] [ not-simples>> ] [ simples>> ] tri 3append
      '[ [ _ [ t substitute ] each ] map ] change-or ;
  
  : contradiction? ( partition -- ? )
      {
-        [ [ primitives>> ] [ not-primitives>> ] bi intersects? ]
+        [ [ simples>> ] [ not-simples>> ] bi intersects? ]
          [ other>> f swap member? ]
      } 1|| ;
  
@@ -192,17 +202,17 @@ TUPLE: class-partition integers not-integers primitives not-primitives and or ot
  
  : filter-integers ( partition -- partition' )
      dup
-    [ primitives>> ] [ not-primitives>> ] [ and>> ] tri
+    [ simples>> ] [ not-simples>> ] [ and>> ] tri
      3append or-class boa
      '[ [ _ class-member? not ] filter ] change-integers ;
  
  : answer-ands ( partition -- partition' )
-    dup [ integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append
+    dup [ integers>> ] [ not-simples>> ] [ simples>> ] tri 3append
      '[ [ _ [ f substitute ] each ] map ] change-and ;
  
  : tautology? ( partition -- ? )
      {
-        [ [ primitives>> ] [ not-primitives>> ] bi intersects? ]
+        [ [ simples>> ] [ not-simples>> ] bi intersects? ]
          [ other>> t swap member? ]
      } 1|| ;
  
@@ -241,8 +251,6 @@ M: f <not-class> drop t ;
  M: primitive-class class-member?
      class>> class-member? ;
  
-UNION: class primitive-class not-class or-class and-class range ;
-
  TUPLE: condition question yes no ;
  C: <condition> condition
  
diff --git a/basis/regexp/combinators/combinators.factor b/basis/regexp/combinators/combinators.factor

index 2941afd99e59c9aa96f6bd423ef823c179fbd315..3bb5fcef6d96ca8f692b8b859f52ef8fdf9f61dc 100644 (file)
--- a/basis/regexp/combinators/combinators.factor
+++ b/basis/regexp/combinators/combinators.factor
@@ -13,14 +13,14 @@ IN: regexp.combinators
  
  PRIVATE>
  
-CONSTANT: <nothing> R/ (?~.*)/
+CONSTANT: <nothing> R/ (?~.*)/s
  
  : <literal> ( string -- regexp )
      [ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable
  
  : <char-range> ( char1 char2 -- regexp )
      [ [ "[" "-" surround ] [ "]" append ] bi* append ]
-    [ <range> ]
+    [ <range-class> ]
      2bi make-regexp ;
  
  : <or> ( regexps -- disjunction )
diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor

index d59d4818ec7ef5926a8dbd13ca4f9c5c61bdf347..f04e88070a059acd8aef6ef23fd2e2766826a410 100644 (file)
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@@ -4,7 +4,7 @@ USING: accessors arrays assocs grouping kernel locals math namespaces
  sequences fry quotations math.order math.ranges vectors
  unicode.categories regexp.transition-tables words sets hashtables
  combinators.short-circuit unicode.case unicode.case.private regexp.ast
-regexp.classes ;
+regexp.classes memoize ;
  IN: regexp.nfa
  
  ! This uses unicode.case.private for ch>upper and ch>lower
@@ -117,8 +117,17 @@ M: or-class modify-class
  M: not-class modify-class
      class>> modify-class <not-class> ;
  
-M: any-char modify-class
-    drop dotall option? t any-char-no-nl ? ;
+MEMO: unix-dot ( -- class )
+    CHAR: \n <not-class> ;
+
+MEMO: nonl-dot ( -- class )
+    { CHAR: \n CHAR: \r } <or-class> <not-class> ;
+
+M: dot modify-class
+    drop dotall option? [ t ] [
+        unix-lines option?
+        unix-dot nonl-dot ?
+    ] if ;
  
  : modify-letter-class ( class -- newclass )
      case-insensitive option? [ drop Letter-class ] when ;
@@ -131,17 +140,17 @@ M: LETTER-class modify-class modify-letter-class ;
          [ [ LETTER? ] bi@ and ]
      } 2|| ;
  
-M: range modify-class
+M: range-class modify-class
      case-insensitive option? [
          dup cased-range? [
              [ from>> ] [ to>> ] bi
-            [ [ ch>lower ] bi@ <range> ]
-            [ [ ch>upper ] bi@ <range> ] 2bi 
+            [ [ ch>lower ] bi@ <range-class> ]
+            [ [ ch>upper ] bi@ <range-class> ] 2bi 
              2array <or-class>
          ] when
      ] when ;
  
-M: class nfa-node
+M: object nfa-node
      modify-class add-simple-entry ;
  
  M: with-options nfa-node ( node -- start end )
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor

index 7b2d6af2c1d17afb1fc8cd0de6d73ce5f22330e5..bf5465e0e2607f0e8142360dfa9f84dde122c81a 100644 (file)
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -18,6 +18,13 @@ ERROR: bad-number ;
  
  ERROR: bad-class name ;
  
+: parse-unicode-class ( name -- class )
+    ! Implement this!
+    drop f ;
+
+: unicode-class ( name -- class )
+    dup parse-unicode-class [ ] [ bad-class ] ?if ;
+
  : name>class ( name -- class )
      >string >case-fold {
          { "lower" letter-class }
@@ -32,8 +39,7 @@ ERROR: bad-class name ;
          { "cntrl" control-character-class }
          { "xdigit" hex-digit-class }
          { "space" java-blank-class }
-        ! TODO: unicode-character-class
-    } [ bad-class ] at-error ;
+    } [ unicode-class ] at-error ;
  
  : lookup-escape ( char -- ast )
      {
@@ -119,10 +125,10 @@ AnyRangeCharacter = EscapeSequence | .
  
  RangeCharacter = !("]") AnyRangeCharacter
  
-Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
        | RangeCharacter
  
-StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
             | AnyRangeCharacter
  
  Ranges = StartRange:s Range*:r => [[ r s prefix ]]
@@ -144,7 +150,7 @@ Parenthized = "?:" Alternation:a => [[ a ]]
  
  Element = "(" Parenthized:p ")" => [[ p ]]
          | "[" CharClass:r "]" => [[ r ]]
-        | ".":d => [[ any-char <primitive-class> ]]
+        | ".":d => [[ dot ]]
          | Character
  
  Number = (!(","|"}").)* => [[ string>number ensure-number ]]
diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor

index 74914e8537cd37b6a31935281b0322f7483fe943..a1f663d03a30eee34919aa940918a7313d3953f0 100644 (file)
--- a/basis/unicode/data/data.factor
+++ b/basis/unicode/data/data.factor
@@ -1,4 +1,4 @@
-! Copyright (C) 2008 Daniel Ehrenberg.
+! Copyright (C) 2008, 2009 Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: combinators.short-circuit assocs math kernel sequences
  io.files hashtables quotations splitting grouping arrays io
@@ -29,6 +29,37 @@ VALUE: properties
  : char>name ( char -- name ) name-map value-at ;
  : property? ( char property -- ? ) properties at interval-key? ;
  
+! For non-existent characters, use Cn
+CONSTANT: categories
+    { "Cn"
+      "Lu" "Ll" "Lt" "Lm" "Lo"
+      "Mn" "Mc" "Me"
+      "Nd" "Nl" "No"
+      "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
+      "Sm" "Sc" "Sk" "So"
+      "Zs" "Zl" "Zp"
+      "Cc" "Cf" "Cs" "Co" }
+
+MEMO: categories-map ( -- hashtable )
+    categories <enum> [ swap ] H{ } assoc-map-as ;
+
+CONSTANT: num-chars HEX: 2FA1E
+
+: category# ( char -- category )
+    ! There are a few characters that should be Cn
+    ! that this gives Cf or Mn
+    ! Cf = 26; Mn = 5; Cn = 29
+    ! Use a compressed array instead?
+    dup category-map ?nth [ ] [
+        dup HEX: E0001 HEX: E007F between?
+        [ drop 26 ] [
+            HEX: E0100 HEX: E01EF between?  5 29 ?
+        ] if
+    ] ?if ;
+
+: category ( char -- category )
+    category# categories nth ;
+
  ! Loading data from UnicodeData.txt
  
  : split-; ( line -- array )
@@ -97,22 +128,6 @@ VALUE: properties
      [ nip zero? not ] assoc-filter
      >hashtable ;
  
-! For non-existent characters, use Cn
-CONSTANT: categories
-    { "Cn"
-      "Lu" "Ll" "Lt" "Lm" "Lo"
-      "Mn" "Mc" "Me"
-      "Nd" "Nl" "No"
-      "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
-      "Sm" "Sc" "Sk" "So"
-      "Zs" "Zl" "Zp"
-      "Cc" "Cf" "Cs" "Co" }
-
-MEMO: categories-map ( -- hashtable )
-    categories <enum> [ swap ] H{ } assoc-map-as ;
-
-CONSTANT: num-chars HEX: 2FA1E
-
  ! the maximum unicode char in the first 3 planes
  
  : ?set-nth ( val index seq -- )
@@ -195,33 +210,5 @@ load-special-casing to: special-casing
  
  load-properties to: properties
  
-! Utility to load resource files that look like Scripts.txt
-
-SYMBOL: interned
-
-: parse-script ( filename -- assoc )
-    ! assoc is code point/range => name
-    ascii file-lines filter-comments [ split-; ] map ;
-
-: range, ( value key -- )
-    swap interned get
-    [ = ] with find nip 2array , ;
-
-: expand-ranges ( assoc -- interval-map )
-    [
-        [
-            swap CHAR: . over member? [
-                ".." split1 [ hex> ] bi@ 2array
-            ] [ hex> ] if range,
-        ] assoc-each
-    ] { } make <interval-map> ;
-
-: process-script ( ranges -- table )
-    dup values prune interned
-    [ expand-ranges ] with-variable ;
-
-: load-script ( filename -- table )
-    parse-script process-script ;
-
  [ name>char [ "Invalid character" throw ] unless* ]
  name>char-hook set-global
diff --git a/basis/unicode/script/script.factor b/basis/unicode/script/script.factor

index 383f9e3de3ca4c225325af461283327d29e1f888..c8f818dbaa226d065b5dc5f6d36f22974cc32da6 100644 (file)
--- a/basis/unicode/script/script.factor
+++ b/basis/unicode/script/script.factor
@@ -7,10 +7,40 @@ words words.symbol compiler.units arrays interval-maps
  unicode.data ;
  IN: unicode.script
  
+<PRIVATE
+
+SYMBOL: interned
+
+: parse-script ( filename -- assoc )
+    ! assoc is code point/range => name
+    ascii file-lines filter-comments [ split-; ] map ;
+
+: range, ( value key -- )
+    swap interned get
+    [ = ] with find nip 2array , ;
+
+: expand-ranges ( assoc -- interval-map )
+    [
+        [
+            swap CHAR: . over member? [
+                ".." split1 [ hex> ] bi@ 2array
+            ] [ hex> ] if range,
+        ] assoc-each
+    ] { } make <interval-map> ;
+
+: process-script ( ranges -- table )
+    dup values prune interned
+    [ expand-ranges ] with-variable ;
+
+: load-script ( filename -- table )
+    parse-script process-script ;
+
  VALUE: script-table
  
  "vocab:unicode/script/Scripts.txt" load-script
  to: script-table
  
+PRIVATE>
+
  : script-of ( char -- script )
      script-table interval-at ;
diff --git a/basis/unicode/syntax/syntax.factor b/basis/unicode/syntax/syntax.factor

index b7ac022d0e1cc7cc49261d3a7340ff5a3ec40caf..5bd8c05e153103658a6df5b25afdb10c49f503fa 100644 (file)
--- a/basis/unicode/syntax/syntax.factor
+++ b/basis/unicode/syntax/syntax.factor
@@ -5,22 +5,7 @@ bit-arrays namespaces make sequences.private arrays quotations
  assocs classes.predicate math.order strings.parser ;
  IN: unicode.syntax
  
-! Character classes (categories)
-
-: category# ( char -- category )
-    ! There are a few characters that should be Cn
-    ! that this gives Cf or Mn
-    ! Cf = 26; Mn = 5; Cn = 29
-    ! Use a compressed array instead?
-    dup category-map ?nth [ ] [
-        dup HEX: E0001 HEX: E007F between?
-        [ drop 26 ] [
-            HEX: E0100 HEX: E01EF between?  5 29 ?
-        ] if
-    ] ?if ;
-
-: category ( char -- category )
-    category# categories nth ;
+<PRIVATE
  
  : >category-array ( categories -- bitarray )
      categories [ swap member? ] with map >bit-array ;
@@ -40,6 +25,8 @@ IN: unicode.syntax
  : define-category ( word categories -- )
      [category] integer swap define-predicate-class ;
  
+PRIVATE>
+
  : CATEGORY:
      CREATE ";" parse-tokens define-category ; parsing
author	Daniel Ehrenberg <littledan@Macintosh-122.(none)>
	Wed, 18 Mar 2009 23:12:18 +0000 (18:12 -0500)
committer	Daniel Ehrenberg <littledan@Macintosh-122.(none)>
	Wed, 18 Mar 2009 23:12:18 +0000 (18:12 -0500)
basis/regexp/ast/ast.factor		patch \| blob \| history
basis/regexp/classes/classes.factor		patch \| blob \| history
basis/regexp/combinators/combinators.factor		patch \| blob \| history
basis/regexp/nfa/nfa.factor		patch \| blob \| history
basis/regexp/parser/parser.factor		patch \| blob \| history
basis/unicode/data/data.factor		patch \| blob \| history
basis/unicode/script/script.factor		patch \| blob \| history
basis/unicode/syntax/syntax.factor		patch \| blob \| history