basis/unicode/data/data.factor

   1 ! Copyright (C) 2008 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: combinators.short-circuit assocs math kernel sequences
   4 io.files hashtables quotations splitting grouping arrays io
   5 math.parser hash2 math.order byte-arrays words namespaces words
   6 compiler.units parser io.encodings.ascii values interval-maps
   7 ascii sets combinators locals math.ranges sorting make
   8 strings.parser io.encodings.utf8 ;
   9 IN: unicode.data
  10
  11 VALUE: simple-lower
  12 VALUE: simple-upper
  13 VALUE: simple-title
  14 VALUE: canonical-map
  15 VALUE: combine-map
  16 VALUE: class-map
  17 VALUE: compatibility-map
  18 VALUE: category-map
  19 VALUE: name-map
  20 VALUE: special-casing
  21 VALUE: properties
  22
  23 : canonical-entry ( char -- seq ) canonical-map at ;
  24 : combine-chars ( a b -- char/f ) combine-map hash2 ;
  25 : compatibility-entry ( char -- seq ) compatibility-map at  ;
  26 : combining-class ( char -- n ) class-map at ;
  27 : non-starter? ( char -- ? ) combining-class { 0 f } member? not ;
  28 : name>char ( name -- char ) name-map at ;
  29 : char>name ( char -- name ) name-map value-at ;
  30 : property? ( char property -- ? ) properties at interval-key? ;
  31
  32 ! Loading data from UnicodeData.txt
  33
  34 : split-; ( line -- array )
  35     ";" split [ [ blank? ] trim ] map ;
  36
  37 : data ( filename -- data )
  38     ascii file-lines [ split-; ] map ;
  39
  40 : load-data ( -- data )
  41     "vocab:unicode/data/UnicodeData.txt" data ;
  42
  43 : filter-comments ( lines -- lines )
  44     [ "#@" split first ] map harvest ;
  45
  46 : (process-data) ( index data -- newdata )
  47     filter-comments
  48     [ [ nth ] keep first swap ] with { } map>assoc
  49     [ [ hex> ] dip ] assoc-map ;
  50
  51 : process-data ( index data -- hash )
  52     (process-data) [ hex> ] assoc-map [ nip ] assoc-filter >hashtable ;
  53
  54 : (chain-decomposed) ( hash value -- newvalue )
  55     [
  56         2dup swap at
  57         [ (chain-decomposed) ] [ 1array nip ] ?if
  58     ] with map concat ;
  59
  60 : chain-decomposed ( hash -- newhash )
  61     dup [ swap (chain-decomposed) ] curry assoc-map ;
  62
  63 : first* ( seq -- ? )
  64     second { [ empty? ] [ first ] } 1|| ;
  65
  66 : (process-decomposed) ( data -- alist )
  67     5 swap (process-data)
  68     [ " " split [ hex> ] map ] assoc-map ;
  69
  70 : exclusions-file ( -- filename )
  71     "vocab:unicode/data/CompositionExclusions.txt" ;
  72
  73 : exclusions ( -- set )
  74     exclusions-file utf8 file-lines
  75     [ "#" split1 drop [ blank? ] trim-tail hex> ] map harvest ;
  76
  77 : remove-exclusions ( alist -- alist )
  78     exclusions [ dup ] H{ } map>assoc assoc-diff ;
  79
  80 : process-canonical ( data -- hash2 hash )
  81     (process-decomposed) [ first* ] filter
  82     [
  83         [ second length 2 = ] filter remove-exclusions
  84         ! using 1009 as the size, the maximum load is 4
  85         [ first2 first2 rot 3array ] map 1009 alist>hash2
  86     ] [ >hashtable chain-decomposed ] bi ;
  87
  88 : process-compatibility ( data -- hash )
  89     (process-decomposed)
  90     [ dup first* [ first2 rest 2array ] unless ] map
  91     [ second empty? not ] filter
  92     >hashtable chain-decomposed ;
  93
  94 : process-combining ( data -- hash )
  95     3 swap (process-data)
  96     [ string>number ] assoc-map
  97     [ nip zero? not ] assoc-filter
  98     >hashtable ;
  99
 100 : categories ( -- names )
 101     ! For non-existent characters, use Cn
 102     { "Cn"
 103       "Lu" "Ll" "Lt" "Lm" "Lo"
 104       "Mn" "Mc" "Me"
 105       "Nd" "Nl" "No"
 106       "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
 107       "Sm" "Sc" "Sk" "So"
 108       "Zs" "Zl" "Zp"
 109       "Cc" "Cf" "Cs" "Co" } ;
 110
 111 : num-chars HEX: 2FA1E ;
 112
 113 ! the maximum unicode char in the first 3 planes
 114
 115 : ?set-nth ( val index seq -- )
 116     2dup bounds-check? [ set-nth ] [ 3drop ] if ;
 117
 118 :: fill-ranges ( table -- table )
 119     name-map >alist sort-values keys
 120     [ { [ "first>" tail? ] [ "last>" tail? ] } 1|| ] filter
 121     2 group [
 122         [ name>char ] bi@ [ [a,b] ] [ table ?nth ] bi
 123         [ swap table ?set-nth ] curry each
 124     ] assoc-each table ;
 125
 126 :: process-category ( data -- category-listing )
 127     [let | table [ num-chars <byte-array> ] |
 128         2 data (process-data) [| char cat |
 129             cat categories index char table ?set-nth
 130         ] assoc-each table fill-ranges ] ;
 131
 132 : process-names ( data -- names-hash )
 133     1 swap (process-data) [
 134         >lower { { CHAR: \s CHAR: - } } substitute swap
 135     ] H{ } assoc-map-as ;
 136
 137 : multihex ( hexstring -- string )
 138     " " split [ hex> ] map sift ;
 139
 140 TUPLE: code-point lower title upper ;
 141
 142 C: <code-point> code-point
 143
 144 : set-code-point ( seq -- )
 145     4 head [ multihex ] map first4
 146     <code-point> swap first set ;
 147
 148 ! Extra properties
 149 : properties-lines ( -- lines )
 150     "vocab:unicode/data/PropList.txt"
 151     ascii file-lines ;
 152
 153 : parse-properties ( -- {{[a,b],prop}} )
 154     properties-lines filter-comments [
 155         split-; first2
 156         [ ".." split1 [ dup ] unless* [ hex> ] bi@ 2array ] dip
 157     ] { } map>assoc ;
 158
 159 : properties>intervals ( properties -- assoc[str,interval] )
 160     dup values prune [ f ] H{ } map>assoc
 161     [ [ push-at ] curry assoc-each ] keep
 162     [ <interval-set> ] assoc-map ;
 163
 164 : load-properties ( -- assoc )
 165     parse-properties properties>intervals ;
 166
 167 ! Special casing data
 168 : load-special-casing ( -- special-casing )
 169     "vocab:unicode/data/SpecialCasing.txt" data
 170     [ length 5 = ] filter
 171     [ [ set-code-point ] each ] H{ } make-assoc ;
 172
 173 load-data {
 174     [ process-names to: name-map ]
 175     [ 13 swap process-data to: simple-lower ]
 176     [ 12 swap process-data to: simple-upper ]
 177     [ 14 swap process-data simple-upper assoc-union to: simple-title ]
 178     [ process-combining to: class-map ]
 179     [ process-canonical to: canonical-map to: combine-map ]
 180     [ process-compatibility to: compatibility-map ]
 181     [ process-category to: category-map ]
 182 } cleave
 183
 184 : postprocess-class ( -- )
 185     combine-map [ [ second ] map ] map concat
 186     [ combining-class not ] filter
 187     [ 0 swap class-map set-at ] each ;
 188
 189 postprocess-class
 190
 191 load-special-casing to: special-casing
 192
 193 load-properties to: properties
 194
 195 ! Utility to load resource files that look like Scripts.txt
 196
 197 SYMBOL: interned
 198
 199 : parse-script ( filename -- assoc )
 200     ! assoc is code point/range => name
 201     ascii file-lines filter-comments [ split-; ] map ;
 202
 203 : range, ( value key -- )
 204     swap interned get
 205     [ = ] with find nip 2array , ;
 206
 207 : expand-ranges ( assoc -- interval-map )
 208     [
 209         [
 210             swap CHAR: . over member? [
 211                 ".." split1 [ hex> ] bi@ 2array
 212             ] [ hex> ] if range,
 213         ] assoc-each
 214     ] { } make <interval-map> ;
 215
 216 : process-script ( ranges -- table )
 217     dup values prune interned
 218     [ expand-ranges ] with-variable ;
 219
 220 : load-script ( filename -- table )
 221     parse-script process-script ;
 222
 223 [ name>char [ "Invalid character" throw ] unless* ]
 224 name>char-hook set-global