basis/unicode/data/data.factor

   1 ! Copyright (C) 2008, 2009 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: arrays ascii assocs byte-arrays combinators
   4 combinators.short-circuit grouping hashtables interval-sets
   5 io.encodings.utf8 io.files kernel locals make math math.bitwise
   6 math.order math.parser math.ranges memoize namespaces sequences
   7 sets simple-flat-file sorting splitting strings.parser ;
   8 IN: unicode.data
   9
  10 <PRIVATE
  11
  ! Lookup tables. Each one is declared empty here and filled in place
  ! by the top-level loading code at the end of this file.

  ! Case mappings: fields 13, 12 and 14 of UnicodeData.txt, plus the
  ! entries read from SpecialCasing.txt.
  CONSTANT: simple-lower H{ }
  CONSTANT: simple-upper H{ }
  CONSTANT: simple-title H{ }
  CONSTANT: special-casing H{ }

  ! Decompositions, and the packed-pair table used to compose two
  ! characters (keys are built with >2ch).
  CONSTANT: canonical-map H{ }
  CONSTANT: compatibility-map H{ }
  CONSTANT: combine-map H{ }

  ! Per-character data: combining class, and a byte vector of category
  ! numbers indexed by code point.
  CONSTANT: class-map H{ }
  CONSTANT: category-map BV{ }

  ! Property name -> interval set, read from PropList.txt.
  CONSTANT: properties H{ }
  22
  ! Packs two code points into one integer: a goes above bit 21, b is
  ! added into the low bits.
  : >2ch ( a b -- c )
      swap 21 shift + ;
  ! Splits a packed pair back into its two halves: a is everything
  ! above bit 21, b is the low 21 bits.
  : 2ch> ( c -- a b )
      [ -21 shift ] keep 21 on-bits mask ;
  25
  26 PRIVATE>
  27
  ! Character name -> code point. Filled at load time by process-names,
  ! which lower-cases each name and replaces spaces with hyphens.
  CONSTANT: name-map H{ }

  ! Decomposition lookups in the tables built at load time.
  : canonical-entry ( char -- seq )
      canonical-map at ; inline

  : compatibility-entry ( char -- seq )
      compatibility-map at ; inline

  ! Looks up the composition of the pair a, b; the key is the pair
  ! packed with >2ch.
  : combine-chars ( a b -- char/f )
      >2ch combine-map at ; inline

  : combining-class ( char -- n )
      class-map at ; inline

  ! True when the character has a combining class that is neither
  ! missing (f) nor 0.
  : non-starter? ( char -- ? )
      combining-class dup [ 0 = not ] when ; inline

  ! Interval set stored under a property name.
  : property ( property -- interval-map )
      properties at ; foldable

  ! Tests whether char lies in the interval set of the named property.
  : property? ( char property -- ? )
      property interval-sets:in? ; inline

  ! Casing tuple stored for ch in the special-casing table.
  : special-case ( ch -- casing-tuple )
      special-casing at ; inline
  38
  ! General category abbreviations; a category's number is its index
  ! in this array. "Cn" is first: for non-existent characters, use Cn.
  CONSTANT: categories {
      "Cn"                                  ! 0
      "Lu" "Ll" "Lt" "Lm" "Lo"              ! 1-5
      "Mn" "Mc" "Me"                        ! 6-8
      "Nd" "Nl" "No"                        ! 9-11
      "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"    ! 12-18
      "Sm" "Sc" "Sk" "So"                   ! 19-22
      "Zs" "Zl" "Zp"                        ! 23-25
      "Cc" "Cf" "Cs" "Co"                   ! 26-29
  }
  50
  51 <PRIVATE
  52
  ! Inverse of categories: abbreviation -> index. Memoized, so the
  ! table is built only on the first call.
  MEMO: categories-map ( -- hashtable )
      H{ } clone categories [ swap pick set-at ] each-index ;

  ! Length of the category table allocated by process-category.
  CONSTANT: num-chars 0x2FA1E
  57
  58 PRIVATE>
  59
  ! Outputs the general category number of char (an index into
  ! categories).
  !
  ! Characters inside category-map are looked up directly. Code points
  ! past the end of the table (where ?nth gives f) are classified by
  ! range, using the indices of the categories array:
  !   Cn = 0, Mn = 6, Cf = 27, Co = 29
  ! The earlier literals (26, 5, 29) predated "Cn" moving to index 0
  ! and selected "Cc", "Lo" and "Co" instead of Cf, Mn and Cn.
  !
  ! There are a few characters inside the tag and variation-selector
  ! ranges that should be Cn but are reported as Cf or Mn here.
  ! Use a compressed array instead?
  : category# ( char -- n )
      dup category-map ?nth [ ] [
          {
              ! Tag characters
              { [ dup 0xE0001 0xE007F between? ] [ drop 27 ] }
              ! Variation selectors supplement
              { [ dup 0xE0100 0xE01EF between? ] [ drop 6 ] }
              ! Private-use planes 15 and 16 stay Co, the value they
              ! were already given before the fallback became Cn.
              { [ dup 0xF0000 0xFFFFD between? ] [ drop 29 ] }
              { [ dup 0x100000 0x10FFFD between? ] [ drop 29 ] }
              ! Everything else outside the table is unassigned.
              [ drop 0 ]
          } cond
      ] ?if ; inline

  ! Outputs the general category abbreviation of char, e.g. "Lu".
  : category ( char -- category )
      category# categories nth ;
  74
  75 <PRIVATE
  76
  ! Loading data from UnicodeData.txt

  ! Reads the file with the data word, which is not defined in this
  ! file (presumably the simple-flat-file reader -- confirm). The
  ! words below treat the result as a sequence of field sequences,
  ! one per line.
  : load-data ( -- data )
      "vocab:unicode/data/UnicodeData.txt"
      data ;
  81
  ! Builds an alist from the parsed lines: the key is the first field
  ! of each line parsed as hex, the value is the field at index, left
  ! as a string.
  : (process-data) ( index data -- newdata )
      [ [ nip first hex> ] [ nth ] 2bi ] with { } map>assoc ;
  85
  ! Like (process-data), but the selected field is parsed as hex too,
  ! and entries whose value is f after parsing are dropped. The result
  ! is a hashtable.
  : process-data ( index data -- hash )
      (process-data)
      [ hex> ] assoc-map
      [ nip ] assoc-filter
      >hashtable ;
  88
  ! Expands a decomposition recursively: every element of value that
  ! has its own entry in hash is replaced by the expansion of that
  ! entry; elements without an entry are kept. The pieces are joined
  ! into one flat sequence.
  : (chain-decomposed) ( hash value -- newvalue )
      [
          2dup of
          [ nip (chain-decomposed) ] [ nip 1array ] if*
      ] with map concat ;
  94
  ! Replaces every value of hash by its full expansion, using hash
  ! itself as the lookup table for (chain-decomposed).
  : chain-decomposed ( hash -- newhash )
      dup [ (chain-decomposed) ] with assoc-map ;
  97
  ! Given a { key value-seq } pair: t when value-seq is empty,
  ! otherwise the first element of value-seq (which may be f).
  : first* ( seq -- ? )
      second [ t ] [ first ] if-empty ;
 100
 ! Extracts field 5 of every line and splits it on spaces, parsing
 ! each piece as hex. Pieces that do not parse stay in the result as
 ! f; first* and process-compatibility rely on that.
 : (process-decomposed) ( data -- alist )
     [ 5 ] dip (process-data)
     [ " " split [ hex> ] map ] assoc-map ;
 104
 ! Path of the composition exclusion list.
 : exclusions-file ( -- filename )
     "vocab:unicode/data/CompositionExclusions.txt" ;

 ! Reads the exclusion file and outputs the code points it lists.
 ! Each line is cut at the first "#", stripped of trailing blanks and
 ! parsed as hex. Lines that hold no code point parse to f, so they
 ! are removed with sift; filtering on 0 alone let those f entries
 ! through. The 0 filter is kept as well.
 : exclusions ( -- set )
     exclusions-file utf8 file-lines
     [ "#" split1 drop [ blank? ] trim-tail hex> ] map
     sift [ 0 = ] reject ;

 ! Drops every entry of alist whose key is an excluded code point.
 : remove-exclusions ( alist -- alist )
     exclusions unique assoc-diff ;
 115
 ! Builds two tables from the entries that pass first*.
 ! First output: for two-element decompositions that are not
 ! excluded, the pair packed with >2ch -> the original character.
 ! Second output: character -> fully chained decomposition.
 : process-canonical ( data -- hash hash )
     (process-decomposed) [ first* ] filter
     [
         [ second length 2 = ] filter remove-exclusions
         [ first2 >2ch swap ] H{ } assoc-map-as
     ] keep
     >hashtable chain-decomposed ;
 122
 ! Character -> fully chained decomposition, for all entries. Entries
 ! that fail first* lose their leading element; entries whose
 ! decomposition is then empty are discarded.
 : process-compatibility ( data -- hash )
     (process-decomposed)
     [ dup first* [ ] [ first2 rest 2array ] if ] map
     [ second empty? not ] filter
     >hashtable chain-decomposed ;
 128
 ! Character -> combining class, taken from field 3 and parsed as a
 ! number. Characters whose class is zero are left out of the table.
 : process-combining ( data -- hash )
     [ 3 ] dip (process-data)
     [ string>number ] assoc-map
     [ nip zero? not ] assoc-filter
     >hashtable ;
 134
 ! num-chars (defined above) is the maximum Unicode char in the first 3 planes
 136
 ! Fills in ranges that the data file gives only by their end points.
 ! Names ending in "first>" or "last>" are taken in code point order
 ! and paired up; every index from the first to the last code point
 ! of a pair gets the table value found at the last code point.
 :: fill-ranges ( table -- table )
     name-map sort-values keys
     [ [ "first>" tail? ] [ "last>" tail? ] bi or ] filter
     2 group [| first-name last-name |
         first-name name-map at :> lo
         last-name name-map at :> hi
         hi table ?nth :> cat
         lo hi [a,b] [ [ cat ] dip table ?set-nth ] each
     ] assoc-each table ;
 144
 ! Builds the category table: a byte array of num-chars elements in
 ! which each character's slot holds the number of its category
 ! (field 2). Range entries are then expanded by fill-ranges.
 :: process-category ( data -- category-listing )
     num-chars <byte-array> :> table
     2 data (process-data)
     [ categories-map at swap table ?set-nth ] assoc-each
     table fill-ranges ;
 150
 ! Name -> character table from field 1. Names are lower-cased and
 ! every space is replaced by a hyphen.
 : process-names ( data -- names-hash )
     [ 1 ] dip (process-data) [
         >lower [ dup CHAR: \s = [ drop CHAR: - ] when ] map swap
     ] H{ } assoc-map-as ;
 155
 ! Parses a space-separated list of hex numbers. Pieces that do not
 ! parse are dropped, so the output is a sequence of integers.
 : multihex ( hexstring -- string )
     " " split [ hex> ] map [ ] filter ;
 158
 159 PRIVATE>
 160
 ! Case mappings of one character as stored in the special-casing
 ! table. set-code-point fills each slot with a multihex result.
 TUPLE: code-point
     lower title upper ;

 ! Constructor taking the slots in the order lower, title, upper.
 C: <code-point> code-point
 164
 165 <PRIVATE
 166
 ! Takes one parsed line and records it in the assoc being built by
 ! the enclosing make. The first four fields are parsed with
 ! multihex: the first number of field 0 is the key, and fields 1-3
 ! become the lower, title and upper slots of the stored code-point.
 : set-code-point ( seq -- )
     4 head [ multihex ] map
     unclip first swap
     first3 <code-point>
     swap ,, ;
 170
 ! Extra properties {{[a,b],prop}}
 ! Reads PropList.txt and turns each key into a { first last } pair
 ! of integers. A key written "a..b" gives { a b }; a key without
 ! ".." gives { a a }. The property value is passed through as is.
 : parse-properties ( -- assoc )
     "vocab:unicode/data/PropList.txt" data [
         [
             ".." split1 over or
             [ hex> ] bi@ 2array
         ] dip
     ] assoc-map ;
 179
 ! Regroups { range property } entries by property. A hashtable is
 ! seeded with every distinct property, each range is pushed under
 ! its property, and each collected list of ranges is then turned
 ! into an interval set.
 : properties>intervals ( properties -- assoc[str,interval] )
     dup values members [ f 2array ] map >hashtable
     swap over [ push-at ] curry assoc-each
     [ <interval-set> ] assoc-map ;

 ! Property name -> interval set, read from PropList.txt.
 : load-properties ( -- assoc )
     parse-properties
     properties>intervals ;
 187
 ! Special casing data
 ! Reads SpecialCasing.txt and builds a hashtable from the lines that
 ! have exactly five fields; other lines are skipped.
 : load-special-casing ( -- special-casing )
     "vocab:unicode/data/SpecialCasing.txt" data
     [
         [ dup length 5 = [ set-code-point ] [ drop ] if ] each
     ] H{ } make ;
 193
 ! Fill the lookup tables from UnicodeData.txt. The quotations run in
 ! the order listed, so simple-upper is already populated when the
 ! title-case table is built.
 load-data {
     [ process-names name-map swap assoc-union! drop ]
     [ 13 swap process-data simple-lower swap assoc-union! drop ]
     [ 12 swap process-data simple-upper swap assoc-union! drop ]
     ! Title case falls back to upper case when field 14 is empty.
     ! assoc-union lets its second argument win on a shared key, so
     ! the title data must come second; with simple-upper second, the
     ! upper-case mapping replaced every explicit title-case mapping.
     [ 14 swap process-data simple-upper swap assoc-union simple-title swap assoc-union! drop ]
     [ process-combining class-map swap assoc-union! drop ]
     ! process-canonical leaves the canonical table on top and the
     ! composition table beneath it.
     [ process-canonical canonical-map swap assoc-union! drop combine-map swap assoc-union! drop ]
     [ process-compatibility compatibility-map swap assoc-union! drop ]
     [ process-category category-map push-all ]
 } cleave
 204
 ! Every character that appears as the second half of a composition
 ! pair but has no combining class recorded is given class 0.
 combine-map keys [ 2ch> nip ] map
 [ class-map at not ] filter
 [ [ 0 ] dip class-map set-at ] each

 ! Merge the SpecialCasing.txt entries into their table.
 special-casing load-special-casing assoc-union! drop

 ! Merge the PropList.txt interval sets into their table.
 properties load-properties assoc-union! drop
 212
 213 PRIVATE>
 214
 ! Store a name lookup in the global name>char-hook: it outputs the
 ! name-map entry for a name and throws "Invalid character" when
 ! there is none. (name>char-hook is not defined in this file;
 ! presumably it comes from strings.parser -- confirm.)
 [
     name-map at dup [ drop "Invalid character" throw ] unless
 ] name>char-hook set-global