basis/unicode/data/data.factor

   1 USING: combinators.short-circuit assocs math kernel sequences
   2 io.files hashtables quotations splitting grouping arrays
   3 math.parser hash2 math.order byte-arrays words namespaces words
   4 compiler.units parser io.encodings.ascii values interval-maps
   5 ascii sets combinators locals math.ranges sorting ;
   6 IN: unicode.data
   7
    8 VALUE: simple-lower ! char -> value of UnicodeData field 13
    9 VALUE: simple-upper ! char -> value of UnicodeData field 12
   10 VALUE: simple-title ! union of the field-14 table and simple-upper
   11 VALUE: canonical-map ! char -> chained decomposition, from process-canonical
   12 VALUE: combine-map ! hash2 of two-element decompositions, from process-canonical
   13 VALUE: class-map ! char -> non-zero value of field 3, from process-combining
   14 VALUE: compatibility-map ! char -> chained decomposition, from process-compatibility
   15 VALUE: category-map ! byte-array indexed by char, from process-category
   16 VALUE: name-map ! normalized name -> char, from process-names
   17 VALUE: special-casing ! char -> code-point tuple, from load-special-casing
   18 VALUE: properties ! property name -> interval set, from load-properties
  19
   20 : canonical-entry ( char -- seq ) canonical-map at ; ! entry of char in canonical-map, f if absent
   21 : combine-chars ( a b -- char/f ) combine-map hash2 ; ! look the pair a, b up in combine-map
   22 : compatibility-entry ( char -- seq ) compatibility-map at  ; ! entry of char in compatibility-map, f if absent
   23 : combining-class ( char -- n ) class-map at ; ! f for chars absent from class-map (zero classes are not stored)
   24 : non-starter? ( char -- ? ) class-map key? ; ! true when char has an entry in class-map
   25 : name>char ( string -- char ) name-map at ; ! string must use the normalized form built by process-names
   26 : char>name ( char -- string ) name-map value-at ; ! reverse lookup in name-map (by value)
   27 : property? ( char property -- ? ) properties at interval-key? ; ! is char inside the intervals stored for property?
  28
  29 ! Convenience functions
  30 : ?between? ( n/f from to -- ? )
  31     pick [ between? ] [ 3drop f ] if ;
  32
  33 ! Loading data from UnicodeData.txt
  34
   35 : split-; ( line -- array )
          ! Split line at every ";" and trim blanks from both ends of each field.
   36     ";" split [ [ blank? ] trim ] map ;
  37
   38 : data ( filename -- data )
          ! Read filename as ASCII lines and split every line into its
          ! ";"-separated fields with split-;.
   39     ascii file-lines [ split-; ] map ;
  40
   41 : load-data ( -- data )
          ! Read and split the UnicodeData.txt file at the resource path below.
   42     "resource:basis/unicode/data/UnicodeData.txt" data ;
  43
   44 : filter-comments ( lines -- lines )
          ! Keep only what precedes the first "#" or "@" element of each line,
          ! then discard the lines that end up empty.
   45     [ "#@" split first ] map harvest ;
  46
  47 : (process-data) ( index data -- newdata )
  48     filter-comments
  49     [ [ nth ] keep first swap ] with { } map>assoc
  50     [ >r hex> r> ] assoc-map ;
  51
   52 : process-data ( index data -- hash )
          ! Like (process-data), but the values are parsed as hex too, entries
          ! whose value is f are dropped, and the result is a hashtable.
   53     (process-data) [ hex> ] assoc-map [ nip ] assoc-filter >hashtable ;
  54
   55 : (chain-decomposed) ( hash value -- newvalue )
          ! Expand every element of value: an element that is a key of hash
          ! is replaced by the recursive expansion of its entry, any other
          ! element becomes a one-element array. The pieces are concatenated.
   56     [
   57         2dup swap at
   58         [ (chain-decomposed) ] [ 1array nip ] ?if
   59     ] with map concat ;
  60
   61 : chain-decomposed ( hash -- newhash )
          ! Replace every value of hash by its expansion from
          ! (chain-decomposed), using hash itself for the lookups.
   62     dup [ swap (chain-decomposed) ] curry assoc-map ;
  63
  64 : first* ( seq -- ? )
  65     second { [ empty? ] [ first ] } 1|| ;
  66
   67 : (process-decomposed) ( data -- alist )
          ! Pair every char with field 5 split at spaces and parsed as hex.
          ! A token that is not hex parses to f (presumably the leading tag
          ! of a compatibility decomposition - confirm against the data file).
   68     5 swap (process-data)
   69     [ " " split [ hex> ] map ] assoc-map ;
  70
   71 : process-canonical ( data -- hash2 hash )
          ! Keeps the entries accepted by first*, then produces two results:
          ! hash2 - built from the entries whose decomposition has exactly two
          !         elements, each { char { a b } } turned into { a b char };
          ! hash  - all kept entries as a hashtable, run through
          !         chain-decomposed.
   72     (process-decomposed) [ first* ] filter
   73     [
   74         [ second length 2 = ] filter
   75         ! using 1009 as the size, the maximum load is 4
   76         [ first2 first2 rot 3array ] map 1009 alist>hash2
   77     ] [ >hashtable chain-decomposed ] bi ;
  78
   79 : process-compatibility ( data -- hash )
          ! Entries rejected by first* (their first token parsed to f) lose
          ! that first token; entries accepted by first* are kept unchanged.
          ! Entries with an empty decomposition are then dropped and the rest
          ! is chained into a hashtable.
   80     (process-decomposed)
   81     [ dup first* [ first2 rest 2array ] unless ] map
   82     [ second empty? not ] filter
   83     >hashtable chain-decomposed ;
  84
   85 : process-combining ( data -- hash )
          ! Hashtable from char to field 3 parsed as a number; entries whose
          ! value is zero are left out.
   86     3 swap (process-data)
   87     [ string>number ] assoc-map
   88     [ nip zero? not ] assoc-filter
   89     >hashtable ;
  90
   91 : categories ( -- names )
          ! Category names; process-category stores the position of a name in
          ! this array as the table byte for a character.
   92     ! For non-existent characters, use Cn
          ! ("Cn" sits at position 0, so table bytes that are never written
          ! read back as Cn.)
   93     { "Cn"
   94       "Lu" "Ll" "Lt" "Lm" "Lo"
   95       "Mn" "Mc" "Me"
   96       "Nd" "Nl" "No"
   97       "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
   98       "Sm" "Sc" "Sk" "So"
   99       "Zs" "Zl" "Zp"
  100       "Cc" "Cf" "Cs" "Co" } ;
 101
  102 : num-chars HEX: 2FA1E ;
  103 ! Length of the category table built by process-category, so the valid
      ! indices are 0 .. HEX: 2FA1D and larger code points are skipped by ?set-nth.
      ! NOTE(review): the earlier note called this the maximum unicode char in
      ! the first 3 planes; as a length it is presumably one past that char -
      ! confirm against UnicodeData.txt.
 104
  105 : ?set-nth ( val index seq -- )
          ! Store val at index of seq when index is in bounds; otherwise do
          ! nothing.
  106     2dup bounds-check? [ set-nth ] [ 3drop ] if ;
 107
  108 :: fill-ranges ( table -- table )
          ! Takes the names in name-map, ordered by their char, that end in
          ! "first>" or "last>", and reads them two at a time as the start
          ! and end marker of a range.
  109     name-map >alist sort-values keys
  110     [ { [ "first>" tail? ] [ "last>" tail? ] } 1|| ] filter
  111     2 group [
              ! Stack after bi: the range [first,last] and the table value
              ! stored at the last char of the pair.
  112         [ name>char ] bi@ [ [a,b] ] [ table ?nth ] bi
              ! Copy that value to every index of the range.
  113         [ swap table ?set-nth ] curry each
  114     ] assoc-each table ;
 115
  116 :: process-category ( data -- category-listing )
          ! Builds a byte-array of num-chars bytes. For every record, the
          ! position of its field 2 in categories is stored at the char's
          ! index (out-of-range chars are skipped by ?set-nth); fill-ranges
          ! then completes the table.
  117     [let | table [ num-chars <byte-array> ] |
  118         2 data (process-data) [| char cat |
  119             cat categories index char table ?set-nth
  120         ] assoc-each table fill-ranges ] ;
 121
 122 : ascii-lower ( string -- lower )
 123     [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
 124
  125 : process-names ( data -- names-hash )
          ! Hashtable from normalized name to char. Field 1 is normalized by
          ! lower-casing A-Z and replacing every space with "-".
  126     1 swap (process-data) [
  127         ascii-lower { { CHAR: \s CHAR: - } } substitute swap
  128     ] H{ } assoc-map-as ;
 129
  130 : multihex ( hexstring -- string )
          ! Split at spaces, parse every token as hex and drop the tokens that
          ! parsed to f.
  131     " " split [ hex> ] map sift ;
 132
  133 TUPLE: code-point lower title upper ; ! one record built by set-code-point
  134
  135 C: <code-point> code-point ! constructor taking lower, title, upper in that order
 136
  137 : set-code-point ( seq -- )
          ! Parse the first four fields of seq with multihex, wrap fields 1-3
          ! in a code-point (lower, title, upper) and store it with set under
          ! the first number of field 0.
  138     4 head [ multihex ] map first4
  139     <code-point> swap first set ;
 140
 141 ! Extra properties
  142 : properties-lines ( -- lines )
          ! Read PropList.txt from the resource path below as ASCII lines.
  143     "resource:basis/unicode/data/PropList.txt"
  144     ascii file-lines ;
 145
  146 : parse-properties ( -- {{[a,b],prop}} )
          ! Every non-comment line is read as "range ; property" and becomes
          ! the pair { { from to } property }. A range written without ".."
          ! uses the same value for from and to.
  147     properties-lines filter-comments [
  148         split-; first2
  149         [ ".." split1 [ dup ] unless* [ hex> ] bi@ 2array ] dip
  150     ] { } map>assoc ;
 151
  152 : properties>intervals ( properties -- assoc[str,interval] )
          ! Start a hashtable with every distinct property name mapped to f.
  153     dup values prune [ f ] H{ } map>assoc
          ! Collect each range under its property name with push-at.
  154     [ [ push-at ] curry assoc-each ] keep
          ! Convert every collected list of ranges with <interval-set>.
  155     [ <interval-set> ] assoc-map ;
 156
  157 : load-properties ( -- assoc )
          ! Parse PropList.txt and group its ranges by property name.
  158     parse-properties properties>intervals ;
 159
 160 ! Special casing data
  161 : load-special-casing ( -- special-casing )
          ! Only records with exactly five fields are used (presumably this
          ! skips comment lines and conditional mappings - confirm against
          ! SpecialCasing.txt). set-code-point stores each one with set, and
          ! make-assoc collects those stores into a new hashtable.
  162     "resource:basis/unicode/data/SpecialCasing.txt" data
  163     [ length 5 = ] filter
  164     [ [ set-code-point ] each ] H{ } make-assoc ;
 165
 166 load-data {
 167     [ process-names to: name-map ]
 168     [ 13 swap process-data to: simple-lower ]
 169     [ 12 swap process-data to: simple-upper ]
 170     [ 14 swap process-data simple-upper assoc-union to: simple-title ]
 171     [ process-combining to: class-map ]
 172     [ process-canonical to: canonical-map to: combine-map ]
 173     [ process-compatibility to: compatibility-map ]
 174     [ process-category to: category-map ]
 175 } cleave
 176
 177 load-special-casing to: special-casing
 178
 179 load-properties to: properties