basis/unicode/data/data.factor

   1 ! Copyright (C) 2008, 2009 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: arrays ascii assocs byte-arrays combinators
   4 combinators.short-circuit grouping hashtables interval-sets
   5 io.encodings.utf8 io.files kernel make math math.bitwise
   6 math.order math.parser ranges namespaces sequences
   7 sets simple-flat-file sorting splitting strings.parser ;
   8 IN: unicode.data
   9
  10 <PRIVATE
  11
! Lookup tables. Each one is declared empty here and filled in place
! (assoc-union! / push-all) by the top-level loading code at the end of
! this file, so the literals below are mutated at load time.

! char -> simple case mapping, from fields 13 (lower), 12 (upper) and
! 14 (title) of UnicodeData.txt; simple-title also receives every
! simple-upper entry that field 14 does not override.
CONSTANT: simple-lower H{ }
CONSTANT: simple-upper H{ }
CONSTANT: simple-title H{ }
! char -> chained canonical decomposition (second result of process-canonical)
CONSTANT: canonical-map H{ }
! >2ch-packed pair of chars -> composed char (first result of process-canonical)
CONSTANT: combine-map H{ }
! char -> combining class; zero classes are dropped by process-combining,
! except for the explicit 0 entries added by the loading code
CONSTANT: class-map H{ }
! char -> chained compatibility decomposition (process-compatibility)
CONSTANT: compatibility-map H{ }
! category index per code point, indexed by char (process-category)
CONSTANT: category-map BV{ }
! char -> code-point tuple (load-special-casing)
CONSTANT: special-casing H{ }
! property name -> interval set of code points (load-properties)
CONSTANT: properties H{ }
  22
  23 : >2ch ( a b -- c ) [ 21 shift ] dip + ; inline
  24 : 2ch> ( c -- a b ) [ -21 shift ] [ 21 on-bits mask ] bi ; inline
  25
  26 PRIVATE>
  27
! Character name -> code point. Declared empty and filled at load time
! from process-names, which lowercases each name and replaces spaces
! with "-". Read by fill-ranges and by the name>char-hook quotation
! installed at the end of this file.
CONSTANT: name-map H{ }
  29
  30 : canonical-entry ( char -- seq ) canonical-map at ; inline
  31 : compatibility-entry ( char -- seq ) compatibility-map at ; inline
  32 : combine-chars ( a b -- char/f ) >2ch combine-map at ; inline
  33 : combining-class ( char -- n ) class-map at ; inline
  34 : non-starter? ( char -- ? ) combining-class { 0 f } member? not ; inline
  35 : property ( property -- interval-map ) properties at ; foldable
  36 : property? ( char property -- ? ) property interval-in? ; inline
  37 : special-case ( ch -- casing-tuple ) special-casing at ; inline
  38
! General category abbreviations. The position of a name in this array
! is the number stored per character in category-map (see
! categories-map and process-category) and returned by category-num,
! so the order is significant.
! For non-existent characters, use Cn: it sits at index 0, presumably
! so that untouched entries of the freshly allocated byte array read
! as Cn -- TODO confirm <byte-array> zero-fills.
CONSTANT: categories {
    "Cn"
    "Lu" "Ll" "Lt" "Lm" "Lo"
    "Mn" "Mc" "Me"
    "Nd" "Nl" "No"
    "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
    "Sm" "Sc" "Sk" "So"
    "Zs" "Zl" "Zp"
    "Cc" "Cf" "Cs" "Co"
}
  50
  51 <PRIVATE
  52
  53 MEMO: categories-map ( -- hashtable )
  54     categories H{ } zip-index-as ;
  55
! Length of the byte array that process-category allocates, so the
! category table covers code points 0 .. NUM-CHARS - 1.
CONSTANT: NUM-CHARS 0x2FA1E
  57
  58 PRIVATE>
  59
  60 : category-num ( char -- n )
  61     ! There are a few characters that should be Cn
  62     ! that this gives Cf or Mn
  63     ! Cf = 26; Mn = 5; Cn = 29
  64     ! Use a compressed array instead?
  65     dup category-map ?nth [ ] [
  66         dup 0xE0001 0xE007F between?
  67         [ drop 26 ] [
  68             0xE0100 0xE01EF between?  5 29 ?
  69         ] if
  70     ] ?if ; inline
  71
  72 : category ( char -- category )
  73     category-num categories nth ;
  74
  75 <PRIVATE
  76
  77 ! Loading data from UnicodeData.txt
  78
  79 : load-unicode-data ( -- data )
  80     "vocab:unicode/UCD/UnicodeData.txt" load-data-file ;
  81
  82 : (process-data) ( index data -- newdata )
  83     [ [ nth ] keep first swap ] with { } map>assoc
  84     [ [ hex> ] dip ] assoc-map ;
  85
  86 : process-data ( index data -- hash )
  87     (process-data) [ hex> ] assoc-map [ nip ] H{ } assoc-filter-as ;
  88
  89 : (chain-decomposed) ( hash value -- newvalue )
  90     [
  91         2dup of
  92         [ (chain-decomposed) ] [ 1array nip ] ?if
  93     ] with map concat ;
  94
  95 : chain-decomposed ( hash -- newhash )
  96     dup [ swap (chain-decomposed) ] curry assoc-map ;
  97
  98 : first* ( seq -- ? )
  99     second { [ empty? ] [ first ] } 1|| ;
 100
 101 : (process-decomposed) ( data -- alist )
 102     5 swap (process-data)
 103     [ split-words [ hex> ] map ] assoc-map ;
 104
 105 : exclusions-file ( -- filename )
 106     "vocab:unicode/UCD/CompositionExclusions.txt" ;
 107
 108 : exclusions ( -- set )
 109     exclusions-file utf8 file-lines
 110     [ "#" split1 drop [ ascii:blank? ] trim-tail hex> ] map
 111     0 swap remove ;
 112
 113 : unique ( seq -- assoc )
 114     [ dup ] H{ } map>assoc ;
 115
 116 : remove-exclusions ( alist -- alist )
 117     exclusions unique assoc-diff ;
 118
 119 : process-canonical ( data -- hash hash )
 120     (process-decomposed) [ first* ] filter
 121     [
 122         [ second length 2 = ] filter remove-exclusions
 123         [ first2 >2ch swap ] H{ } assoc-map-as
 124     ] [ >hashtable chain-decomposed ] bi ;
 125
 126 : process-compatibility ( data -- hash )
 127     (process-decomposed)
 128     [ dup first* [ first2 rest 2array ] unless ] map
 129     [ second empty? ] reject
 130     >hashtable chain-decomposed ;
 131
 132 : process-combining ( data -- hash )
 133     3 swap (process-data)
 134     [ string>number ] assoc-map
 135     [ nip zero? ] assoc-reject
 136     >hashtable ;
 137
! (This note appears to describe NUM-CHARS, defined earlier in this file:)
! the maximum Unicode character in the first 3 planes
 139
 140 :: fill-ranges ( table -- table )
 141     name-map sort-values keys
 142     [ { [ "first>" tail? ] [ "last>" tail? ] } 1|| ] filter
 143     2 group [
 144         [ name-map at ] bi@ [ [a..b] ] [ table ?nth ] bi
 145         [ swap table ?set-nth ] curry each
 146     ] assoc-each table ;
 147
 148 :: process-category ( data -- category-listing )
 149     NUM-CHARS <byte-array> :> table
 150     2 data (process-data) [| char cat |
 151         cat categories-map at char table ?set-nth
 152     ] assoc-each table fill-ranges ;
 153
 154 : process-names ( data -- names-hash )
 155     1 swap (process-data) [
 156         >lower H{ { CHAR: \s CHAR: - } } substitute swap
 157     ] H{ } assoc-map-as ;
 158
 159 : multihex ( hexstring -- string )
 160     split-words [ hex> ] map sift ;
 161
 162 PRIVATE>
 163
 164 TUPLE: code-point lower title upper ;
 165
 166 C: <code-point> code-point
 167
 168 <PRIVATE
 169
 170 : set-code-point ( seq -- )
 171     4 head [ multihex ] map first4
 172     <code-point> swap first ,, ;
 173
 174 ! Extra properties {{[a,b],prop}}
 175 : parse-properties ( -- assoc )
 176     "vocab:unicode/UCD/PropList.txt" load-data-file [
 177         [
 178             ".." split1 [ dup ] unless*
 179             [ hex> ] bi@ 2array
 180         ] dip
 181     ] assoc-map ;
 182
 183 : properties>intervals ( properties -- assoc[str,interval] )
 184     dup values members [ f ] H{ } map>assoc
 185     [ [ push-at ] curry assoc-each ] keep
 186     [ <interval-set> ] assoc-map ;
 187
 188 : load-properties ( -- assoc )
 189     parse-properties properties>intervals ;
 190
 191 ! Special casing data
 192 : load-special-casing ( -- special-casing )
 193     "vocab:unicode/UCD/SpecialCasing.txt" load-data-file
 194     [ length 5 = ] filter
 195     [ [ set-code-point ] each ] H{ } make ;
 196
! Load-time initialisation: parse UnicodeData.txt once and fill every
! table in place. The order of the quotations matters:
! - name-map is filled first because process-category (through
!   fill-ranges) reads it;
! - simple-upper is filled before simple-title, which starts from the
!   simple-upper entries and lets field 14 override them;
! - process-canonical leaves two hashes on the stack: the top one goes
!   to canonical-map, the one below to combine-map.
load-unicode-data {
    [ process-names name-map swap assoc-union! drop ]
    [ 13 swap process-data simple-lower swap assoc-union! drop ]
    [ 12 swap process-data simple-upper swap assoc-union! drop ]
    [ 14 swap process-data simple-upper assoc-union simple-title swap assoc-union! drop ]
    [ process-combining class-map swap assoc-union! drop ]
    [ process-canonical canonical-map swap assoc-union! drop combine-map swap assoc-union! drop ]
    [ process-compatibility compatibility-map swap assoc-union! drop ]
    [ process-category category-map push-all ]
} cleave

! Every character that appears as the second half of a combine-map key
! and has no class-map entry gets an explicit combining class of 0.
combine-map keys [ 2ch> nip ] map
[ class-map at ] reject
[ 0 swap class-map set-at ] each

! Fill the special-casing table.
load-special-casing special-casing swap assoc-union! drop

! Fill the property table.
load-properties properties swap assoc-union! drop
 215
 216 PRIVATE>
 217
 218 ERROR: invalid-unicode-character name ;
 219
 220 [
 221     name-map ?at [ invalid-unicode-character ] unless
 222 ] name>char-hook set-global