basis/unicode/data/data.factor

   1 ! Copyright (C) 2008 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: combinators.short-circuit assocs math kernel sequences
   4 io.files hashtables quotations splitting grouping arrays io
   5 math.parser hash2 math.order byte-arrays words namespaces words
   6 compiler.units parser io.encodings.ascii values interval-maps
   7 ascii sets combinators locals math.ranges sorting make
   8 strings.parser io.encodings.utf8 memoize ;
   9 IN: unicode.data
  10
  11 VALUE: simple-lower
  12 VALUE: simple-upper
  13 VALUE: simple-title
  14 VALUE: canonical-map
  15 VALUE: combine-map
  16 VALUE: class-map
  17 VALUE: compatibility-map
  18 VALUE: category-map
  19 VALUE: name-map
  20 VALUE: special-casing
  21 VALUE: properties
  22
  23 : canonical-entry ( char -- seq ) canonical-map at ;
  24 : combine-chars ( a b -- char/f ) combine-map hash2 ;
  25 : compatibility-entry ( char -- seq ) compatibility-map at  ;
  26 : combining-class ( char -- n ) class-map at ;
  27 : non-starter? ( char -- ? ) combining-class { 0 f } member? not ;
  28 : name>char ( name -- char ) name-map at ;
  29 : char>name ( char -- name ) name-map value-at ;
  30 : property? ( char property -- ? ) properties at interval-key? ;
  31
  32 ! Loading data from UnicodeData.txt
  33
  34 : split-; ( line -- array )
  35     ";" split [ [ blank? ] trim ] map ;
  36
  37 : data ( filename -- data )
  38     ascii file-lines [ split-; ] map ;
  39
  40 : load-data ( -- data )
  41     "vocab:unicode/data/UnicodeData.txt" data ;
  42
  43 : filter-comments ( lines -- lines )
  44     [ "#@" split first ] map harvest ;
  45
  46 : (process-data) ( index data -- newdata )
  47     filter-comments
  48     [ [ nth ] keep first swap ] with { } map>assoc
  49     [ [ hex> ] dip ] assoc-map ;
  50
  51 : process-data ( index data -- hash )
  52     (process-data) [ hex> ] assoc-map [ nip ] assoc-filter >hashtable ;
  53
  54 : (chain-decomposed) ( hash value -- newvalue )
  55     [
  56         2dup swap at
  57         [ (chain-decomposed) ] [ 1array nip ] ?if
  58     ] with map concat ;
  59
  60 : chain-decomposed ( hash -- newhash )
  61     dup [ swap (chain-decomposed) ] curry assoc-map ;
  62
  63 : first* ( seq -- ? )
  64     second { [ empty? ] [ first ] } 1|| ;
  65
  66 : (process-decomposed) ( data -- alist )
  67     5 swap (process-data)
  68     [ " " split [ hex> ] map ] assoc-map ;
  69
  70 : exclusions-file ( -- filename )
  71     "vocab:unicode/data/CompositionExclusions.txt" ;
  72
  73 : exclusions ( -- set )
  74     exclusions-file utf8 file-lines
  75     [ "#" split1 drop [ blank? ] trim-tail hex> ] map harvest ;
  76
  77 : remove-exclusions ( alist -- alist )
  78     exclusions [ dup ] H{ } map>assoc assoc-diff ;
  79
  80 : process-canonical ( data -- hash2 hash )
  81     (process-decomposed) [ first* ] filter
  82     [
  83         [ second length 2 = ] filter remove-exclusions
  84         ! using 1009 as the size, the maximum load is 4
  85         [ first2 first2 rot 3array ] map 1009 alist>hash2
  86     ] [ >hashtable chain-decomposed ] bi ;
  87
  88 : process-compatibility ( data -- hash )
  89     (process-decomposed)
  90     [ dup first* [ first2 rest 2array ] unless ] map
  91     [ second empty? not ] filter
  92     >hashtable chain-decomposed ;
  93
  94 : process-combining ( data -- hash )
  95     3 swap (process-data)
  96     [ string>number ] assoc-map
  97     [ nip zero? not ] assoc-filter
  98     >hashtable ;
  99
 100 ! For non-existent characters, use Cn
 101 CONSTANT: categories
 102     { "Cn"
 103       "Lu" "Ll" "Lt" "Lm" "Lo"
 104       "Mn" "Mc" "Me"
 105       "Nd" "Nl" "No"
 106       "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
 107       "Sm" "Sc" "Sk" "So"
 108       "Zs" "Zl" "Zp"
 109       "Cc" "Cf" "Cs" "Co" }
 110
 111 MEMO: categories-map ( -- hashtable )
 112     categories <enum> [ swap ] H{ } assoc-map-as ;
 113
 114 CONSTANT: num-chars HEX: 2FA1E
 115
 116 ! the maximum unicode char in the first 3 planes
 117
 118 : ?set-nth ( val index seq -- )
 119     2dup bounds-check? [ set-nth ] [ 3drop ] if ;
 120
 121 :: fill-ranges ( table -- table )
 122     name-map >alist sort-values keys
 123     [ { [ "first>" tail? ] [ "last>" tail? ] } 1|| ] filter
 124     2 group [
 125         [ name>char ] bi@ [ [a,b] ] [ table ?nth ] bi
 126         [ swap table ?set-nth ] curry each
 127     ] assoc-each table ;
 128
 129 :: process-category ( data -- category-listing )
 130     num-chars <byte-array> :> table
 131     2 data (process-data) [| char cat |
 132         cat categories-map at char table ?set-nth
 133     ] assoc-each table fill-ranges ;
 134
 135 : process-names ( data -- names-hash )
 136     1 swap (process-data) [
 137         >lower { { CHAR: \s CHAR: - } } substitute swap
 138     ] H{ } assoc-map-as ;
 139
 140 : multihex ( hexstring -- string )
 141     " " split [ hex> ] map sift ;
 142
 143 TUPLE: code-point lower title upper ;
 144
 145 C: <code-point> code-point
 146
 147 : set-code-point ( seq -- )
 148     4 head [ multihex ] map first4
 149     <code-point> swap first set ;
 150
 151 ! Extra properties
 152 : properties-lines ( -- lines )
 153     "vocab:unicode/data/PropList.txt"
 154     ascii file-lines ;
 155
 156 : parse-properties ( -- {{[a,b],prop}} )
 157     properties-lines filter-comments [
 158         split-; first2
 159         [ ".." split1 [ dup ] unless* [ hex> ] bi@ 2array ] dip
 160     ] { } map>assoc ;
 161
 162 : properties>intervals ( properties -- assoc[str,interval] )
 163     dup values prune [ f ] H{ } map>assoc
 164     [ [ push-at ] curry assoc-each ] keep
 165     [ <interval-set> ] assoc-map ;
 166
 167 : load-properties ( -- assoc )
 168     parse-properties properties>intervals ;
 169
 170 ! Special casing data
 171 : load-special-casing ( -- special-casing )
 172     "vocab:unicode/data/SpecialCasing.txt" data
 173     [ length 5 = ] filter
 174     [ [ set-code-point ] each ] H{ } make-assoc ;
 175
 176 load-data {
 177     [ process-names to: name-map ]
 178     [ 13 swap process-data to: simple-lower ]
 179     [ 12 swap process-data to: simple-upper ]
 180     [ 14 swap process-data simple-upper assoc-union to: simple-title ]
 181     [ process-combining to: class-map ]
 182     [ process-canonical to: canonical-map to: combine-map ]
 183     [ process-compatibility to: compatibility-map ]
 184     [ process-category to: category-map ]
 185 } cleave
 186
 187 : postprocess-class ( -- )
 188     combine-map [ [ second ] map ] map concat
 189     [ combining-class not ] filter
 190     [ 0 swap class-map set-at ] each ;
 191
 192 postprocess-class
 193
 194 load-special-casing to: special-casing
 195
 196 load-properties to: properties
 197
 198 ! Utility to load resource files that look like Scripts.txt
 199
 200 SYMBOL: interned
 201
 202 : parse-script ( filename -- assoc )
 203     ! assoc is code point/range => name
 204     ascii file-lines filter-comments [ split-; ] map ;
 205
 206 : range, ( value key -- )
 207     swap interned get
 208     [ = ] with find nip 2array , ;
 209
 210 : expand-ranges ( assoc -- interval-map )
 211     [
 212         [
 213             swap CHAR: . over member? [
 214                 ".." split1 [ hex> ] bi@ 2array
 215             ] [ hex> ] if range,
 216         ] assoc-each
 217     ] { } make <interval-map> ;
 218
 219 : process-script ( ranges -- table )
 220     dup values prune interned
 221     [ expand-ranges ] with-variable ;
 222
 223 : load-script ( filename -- table )
 224     parse-script process-script ;
 225
 226 [ name>char [ "Invalid character" throw ] unless* ]
 227 name>char-hook set-global