Reorganizing Unicode data, so that the data tables are private; ch>upper moves to...

author Daniel Ehrenberg <littledan@Macintosh-122.local>

Thu, 19 Mar 2009 02:04:36 +0000 (21:04 -0500)

committer Daniel Ehrenberg <littledan@Macintosh-122.local>

Thu, 19 Mar 2009 02:04:36 +0000 (21:04 -0500)
author Daniel Ehrenberg <littledan@Macintosh-122.local>
Thu, 19 Mar 2009 02:04:36 +0000 (21:04 -0500)
committer Daniel Ehrenberg <littledan@Macintosh-122.local>
Thu, 19 Mar 2009 02:04:36 +0000 (21:04 -0500)
diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor

index f04e88070a059acd8aef6ef23fd2e2766826a410..a692f707780f239754fe7570ce116f580f304542 100644 (file)
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@@ -3,11 +3,11 @@
  USING: accessors arrays assocs grouping kernel locals math namespaces
  sequences fry quotations math.order math.ranges vectors
  unicode.categories regexp.transition-tables words sets hashtables
-combinators.short-circuit unicode.case unicode.case.private regexp.ast
+combinators.short-circuit unicode.data regexp.ast
  regexp.classes memoize ;
  IN: regexp.nfa
  
-! This uses unicode.case.private for ch>upper and ch>lower
+! This uses unicode.data for ch>upper and ch>lower
  ! but case-insensitive matching should be done by case-folding everything
  ! before processing starts
  
diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor

index f2e94545455972ba712c954d1d714b01db6d6ff3..91f6a45911cce51ada86bbd44f4c6896ec1e6edd 100644 (file)
--- a/basis/unicode/breaks/breaks.factor
+++ b/basis/unicode/breaks/breaks.factor
@@ -2,8 +2,9 @@
  ! See http://factorcode.org/license.txt for BSD license.
  USING: combinators.short-circuit unicode.categories kernel math
  combinators splitting sequences math.parser io.files io assocs
-arrays namespaces make math.ranges unicode.normalize.private values
-io.encodings.ascii unicode.syntax unicode.data compiler.units fry
+arrays namespaces make math.ranges unicode.normalize
+unicode.normalize.private values io.encodings.ascii
+unicode.syntax unicode.data compiler.units fry
  alien.syntax sets accessors interval-maps memoize locals words ;
  IN: unicode.breaks
  
@@ -126,7 +127,7 @@ to: grapheme-table
  
  VALUE: word-break-table
  
-"vocab:unicode/data/WordBreakProperty.txt" load-script
+"vocab:unicode/data/WordBreakProperty.txt" load-key-value
  to: word-break-table
  
  C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
diff --git a/basis/unicode/case/case-tests.factor b/basis/unicode/case/case-tests.factor

index 52a8d9755eb2ef7a99afbe90b0816e2f1c8a07be..a76f5e78c408c3a1cd8c7955db87d9828d75dc7b 100644 (file)
--- a/basis/unicode/case/case-tests.factor
+++ b/basis/unicode/case/case-tests.factor
@@ -1,6 +1,7 @@
  ! Copyright (C) 2008, 2009 Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.case unicode.case.private tools.test namespaces strings unicode.normalize ;
+USING: unicode.case tools.test namespaces strings unicode.normalize
+unicode.case.private ;
  IN: unicode.case.tests
  
  \ >upper must-infer
diff --git a/basis/unicode/case/case.factor b/basis/unicode/case/case.factor

index c75582dacd82a5497ca91c43fdaf391adaf91983..fa842b8b818a1bed743ea5e46c647877f4c7469d 100644 (file)
--- a/basis/unicode/case/case.factor
+++ b/basis/unicode/case/case.factor
@@ -7,12 +7,6 @@ strings splitting kernel accessors unicode.breaks fry locals ;
  QUALIFIED: ascii
  IN: unicode.case
  
-<PRIVATE
-: ch>lower ( ch -- lower ) simple-lower at-default ; inline
-: ch>upper ( ch -- upper ) simple-upper at-default ; inline
-: ch>title ( ch -- title ) simple-title at-default ; inline
-PRIVATE>
-
  SYMBOL: locale ! Just casing locale, or overall?
  
  <PRIVATE
@@ -86,7 +80,7 @@ SYMBOL: locale ! Just casing locale, or overall?
  :: map-case ( string string-quot char-quot -- case )
      string length <sbuf> :> out
      string [
-        dup special-casing at
+        dup special-case
          [ string-quot call out push-all ]
          [ char-quot call out push ] ?if
      ] each out "" like ; inline
diff --git a/basis/unicode/data/data-docs.factor b/basis/unicode/data/data-docs.factor

index 0123c98a670ad601edc25b20ba7fc57a63dcca19..d1a458eb480066de5fe5bb8fd913f30375268156 100644 (file)
--- a/basis/unicode/data/data-docs.factor
+++ b/basis/unicode/data/data-docs.factor
@@ -13,7 +13,8 @@ ARTICLE: "unicode.data" "Unicode data tables"
  { $subsection non-starter? }
  { $subsection name>char }
  { $subsection char>name }
-{ $subsection property? } ;
+{ $subsection property? }
+{ $subsection load-key-value } ;
  
  HELP: canonical-entry
  { $values { "char" "a code point" } { "seq" string } }
@@ -46,3 +47,7 @@ HELP: name>char
  HELP: property?
  { $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
  { $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
+
+HELP: load-key-value
+{ $values { "filename" string } { "table" "an interval map" } }
+{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor

index a1f663d03a30eee34919aa940918a7313d3953f0..93df3d5a8c2e3cf02f4d8188c540b5916c381054 100644 (file)
--- a/basis/unicode/data/data.factor
+++ b/basis/unicode/data/data.factor
@@ -8,6 +8,8 @@ ascii sets combinators locals math.ranges sorting make
  strings.parser io.encodings.utf8 memoize ;
  IN: unicode.data
  
+<PRIVATE
+
  VALUE: simple-lower
  VALUE: simple-upper
  VALUE: simple-title
@@ -20,14 +22,20 @@ VALUE: name-map
  VALUE: special-casing
  VALUE: properties
  
-: canonical-entry ( char -- seq ) canonical-map at ;
-: combine-chars ( a b -- char/f ) combine-map hash2 ;
-: compatibility-entry ( char -- seq ) compatibility-map at  ;
-: combining-class ( char -- n ) class-map at ;
-: non-starter? ( char -- ? ) combining-class { 0 f } member? not ;
-: name>char ( name -- char ) name-map at ;
-: char>name ( char -- name ) name-map value-at ;
-: property? ( char property -- ? ) properties at interval-key? ;
+PRIVATE>
+
+: canonical-entry ( char -- seq ) canonical-map at ; inline
+: combine-chars ( a b -- char/f ) combine-map hash2 ; inline
+: compatibility-entry ( char -- seq ) compatibility-map at ; inline
+: combining-class ( char -- n ) class-map at ; inline
+: non-starter? ( char -- ? ) combining-class { 0 f } member? not ; inline
+: name>char ( name -- char ) name-map at ; inline
+: char>name ( char -- name ) name-map value-at ; inline
+: property? ( char property -- ? ) properties at interval-key? ; inline
+: ch>lower ( ch -- lower ) simple-lower at-default ; inline
+: ch>upper ( ch -- upper ) simple-upper at-default ; inline
+: ch>title ( ch -- title ) simple-title at-default ; inline
+: special-case ( ch -- casing-tuple ) special-casing at ; inline
  
  ! For non-existent characters, use Cn
  CONSTANT: categories
@@ -40,11 +48,15 @@ CONSTANT: categories
        "Zs" "Zl" "Zp"
        "Cc" "Cf" "Cs" "Co" }
  
+<PRIVATE
+
  MEMO: categories-map ( -- hashtable )
      categories <enum> [ swap ] H{ } assoc-map-as ;
  
  CONSTANT: num-chars HEX: 2FA1E
  
+PRIVATE>
+
  : category# ( char -- category )
      ! There are a few characters that should be Cn
      ! that this gives Cf or Mn
@@ -60,6 +72,8 @@ CONSTANT: num-chars HEX: 2FA1E
  : category ( char -- category )
      category# categories nth ;
  
+<PRIVATE
+
  ! Loading data from UnicodeData.txt
  
  : split-; ( line -- array )
@@ -155,10 +169,14 @@ CONSTANT: num-chars HEX: 2FA1E
  : multihex ( hexstring -- string )
      " " split [ hex> ] map sift ;
  
+PRIVATE>
+
  TUPLE: code-point lower title upper ;
  
  C: <code-point> code-point
  
+<PRIVATE
+
  : set-code-point ( seq -- )
      4 head [ multihex ] map first4
      <code-point> swap first set ;
@@ -212,3 +230,31 @@ load-properties to: properties
  
  [ name>char [ "Invalid character" throw ] unless* ]
  name>char-hook set-global
+
+SYMBOL: interned
+
+: parse-key-value ( filename -- assoc )
+    ! assoc is code point/range => name
+    ascii file-lines filter-comments [ split-; ] map ;
+
+: range, ( value key -- )
+    swap interned get
+    [ = ] with find nip 2array , ;
+
+: expand-ranges ( assoc -- interval-map )
+    [
+        [
+            swap CHAR: . over member? [
+                ".." split1 [ hex> ] bi@ 2array
+            ] [ hex> ] if range,
+        ] assoc-each
+    ] { } make <interval-map> ;
+
+: process-key-value ( ranges -- table )
+    dup values prune interned
+    [ expand-ranges ] with-variable ;
+
+PRIVATE>
+
+: load-key-value ( filename -- table )
+    parse-key-value process-key-value ;
diff --git a/basis/unicode/script/script.factor b/basis/unicode/script/script.factor

index c8f818dbaa226d065b5dc5f6d36f22974cc32da6..ed804760848a07c8c275b4e6dbce4c3d719678e4 100644 (file)
--- a/basis/unicode/script/script.factor
+++ b/basis/unicode/script/script.factor
@@ -9,35 +9,9 @@ IN: unicode.script
  
  <PRIVATE
  
-SYMBOL: interned
-
-: parse-script ( filename -- assoc )
-    ! assoc is code point/range => name
-    ascii file-lines filter-comments [ split-; ] map ;
-
-: range, ( value key -- )
-    swap interned get
-    [ = ] with find nip 2array , ;
-
-: expand-ranges ( assoc -- interval-map )
-    [
-        [
-            swap CHAR: . over member? [
-                ".." split1 [ hex> ] bi@ 2array
-            ] [ hex> ] if range,
-        ] assoc-each
-    ] { } make <interval-map> ;
-
-: process-script ( ranges -- table )
-    dup values prune interned
-    [ expand-ranges ] with-variable ;
-
-: load-script ( filename -- table )
-    parse-script process-script ;
-
  VALUE: script-table
  
-"vocab:unicode/script/Scripts.txt" load-script
+"vocab:unicode/script/Scripts.txt" load-key-value
  to: script-table
  
  PRIVATE>
author	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Thu, 19 Mar 2009 02:04:36 +0000 (21:04 -0500)
committer	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Thu, 19 Mar 2009 02:04:36 +0000 (21:04 -0500)
basis/regexp/nfa/nfa.factor		patch \| blob \| history
basis/unicode/breaks/breaks.factor		patch \| blob \| history
basis/unicode/case/case-tests.factor		patch \| blob \| history
basis/unicode/case/case.factor		patch \| blob \| history
basis/unicode/data/data-docs.factor		patch \| blob \| history
basis/unicode/data/data.factor		patch \| blob \| history
basis/unicode/script/script.factor		patch \| blob \| history