From 338acc8843d5ec70c36bb154b1b86b6b04704781 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 28 Jul 2019 15:51:05 -0500 Subject: [PATCH] unicode: Minor cleanups. This fixes/breaks some of the tests. Why? { 3953 3958 3960 3968 3969 } [ canonical-map delete-at ] each --- basis/unicode/collation/collation.factor | 13 ++++--------- basis/unicode/data/data.factor | 8 ++++---- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/basis/unicode/collation/collation.factor b/basis/unicode/collation/collation.factor index 4c0ad626c1..97bb8f486f 100644 --- a/basis/unicode/collation/collation.factor +++ b/basis/unicode/collation/collation.factor @@ -32,14 +32,7 @@ TUPLE: weight-levels primary secondary tertiary ignorable? ; "vocab:unicode/UCA/allkeys.txt" parse-ducet ducet set-global -! Fix up table for long contractions -: help-one ( assoc key -- ) - ! Need to be more general? Not for DUCET, apparently - 2 head 2dup swap key? [ 2drop ] [ - [ [ 1string of ] with { } map-as concat ] - [ swap set-at ] 2bi - ] if ; - +! https://www.unicode.org/reports/tr10/tr10-41.html#Well_Formed_DUCET : fixup-ducet ( -- ) { { @@ -195,7 +188,8 @@ TUPLE: weight-levels primary secondary tertiary ignorable? ; } } ducet get-global '[ swap >string _ set-at ] assoc-each ; -! Add a few missing ducet values +! Add a few missing ducet values for Tibetan +! https://www.unicode.org/reports/tr10/tr10-41.html#Well_Formed_DUCET fixup-ducet : tangut-block? ( char -- ? ) @@ -248,6 +242,7 @@ fixup-ducet : building-last ( -- char ) building get [ 0 ] [ last last ] if-empty ; +! https://www.unicode.org/reports/tr10/tr10-41.html#Collation_Graphemes : blocked? ( char -- ? ) combining-class dup { 0 f } member? [ drop building-last non-starter? ] diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor index bc744a747d..32ac3e94fa 100644 --- a/basis/unicode/data/data.factor +++ b/basis/unicode/data/data.factor @@ -53,7 +53,7 @@ CONSTANT: categories { MEMO: categories-map ( -- hashtable ) categories H{ } zip-index-as ; -CONSTANT: num-chars 0x2FA1E +CONSTANT: NUM-CHARS 0x2FA1E PRIVATE> @@ -76,7 +76,7 @@ PRIVATE> ! Loading data from UnicodeData.txt -: load-data ( -- data ) +: load-unicode-data ( -- data ) "vocab:unicode/UCD/UnicodeData.txt" load-data-file ; : (process-data) ( index data -- newdata ) @@ -146,7 +146,7 @@ PRIVATE> ] assoc-each table ; :: process-category ( data -- category-listing ) - num-chars :> table + NUM-CHARS :> table 2 data (process-data) [| char cat | cat categories-map at char table ?set-nth ] assoc-each table fill-ranges ; @@ -194,7 +194,7 @@ C: code-point [ length 5 = ] filter [ [ set-code-point ] each ] H{ } make ; -load-data { +load-unicode-data { [ process-names name-map swap assoc-union! drop ] [ 13 swap process-data simple-lower swap assoc-union! drop ] [ 12 swap process-data simple-upper swap assoc-union! drop ] -- 2.34.1