From 8f3ce6f49a2f65bc21a39707abf85ce9f8d69ed2 Mon Sep 17 00:00:00 2001 From: John Benediktsson Date: Tue, 26 May 2020 10:05:09 -0700 Subject: [PATCH] punycode: adding basic support for Punycode (RFC 3492). --- extra/punycode/authors.txt | 1 + extra/punycode/punycode-tests.factor | 149 ++++++++++++++++++++++++ extra/punycode/punycode.factor | 164 +++++++++++++++++++++++++++ extra/punycode/summary.txt | 1 + 4 files changed, 315 insertions(+) create mode 100644 extra/punycode/authors.txt create mode 100644 extra/punycode/punycode-tests.factor create mode 100644 extra/punycode/punycode.factor create mode 100644 extra/punycode/summary.txt diff --git a/extra/punycode/authors.txt b/extra/punycode/authors.txt new file mode 100644 index 0000000000..e091bb8164 --- /dev/null +++ b/extra/punycode/authors.txt @@ -0,0 +1 @@ +John Benediktsson diff --git a/extra/punycode/punycode-tests.factor b/extra/punycode/punycode-tests.factor new file mode 100644 index 0000000000..e31550ab7a --- /dev/null +++ b/extra/punycode/punycode-tests.factor @@ -0,0 +1,149 @@ + +USING: arrays assocs fry kernel punycode tools.test ; + +{ + ! Wikipedia Examples + + { "" "" } + { "A" "A-" } + { "3" "3-" } + { "-" "--" } + { "--" "---" } + { "abc" "abc-" } + { "London" "London-" } + { "Lloyd-Atkinson" "Lloyd-Atkinson-" } + { "This has spaces" "This has spaces-" } + { "ü" "tda" } + { "αβγ" "mxacd" } + { "München" "Mnchen-3ya" } + { "Mnchen-3ya" "Mnchen-3ya-" } + { "München-Ost" "Mnchen-Ost-9db" } + { "Bahnhof München-Ost" "Bahnhof Mnchen-Ost-u6b" } + + ! (A) Arabic (Egyptian): + { + "\u{0644}\u{064A}\u{0647}\u{0645}\u{0627}\u{0628}\u{062A}\u{0643}\u{0644}\u{0645}\u{0648}\u{0634}\u{0639}\u{0631}\u{0628}\u{064A}\u{061F}" + "egbpdaj6bu4bxfgehfvwxn" + } + + ! (B) Chinese (simplified): + { + "\u{4ED6}\u{4EEC}\u{4E3A}\u{4EC0}\u{4E48}\u{4E0D}\u{8BF4}\u{4E2D}\u{6587}" + "ihqwcrb4cv8a8dqg056pqjye" + } + + ! (C) Chinese (traditional): + { + "\u{4ED6}\u{5011}\u{7232}\u{4EC0}\u{9EBD}\u{4E0D}\u{8AAA}\u{4E2D}\u{6587}" + "ihqwctvzc91f659drss3x8bo0yb" + } + + ! (D) Czech: + { + "\u{0050}\u{0072}\u{006F}\u{010D}\u{0070}\u{0072}\u{006F}\u{0073}\u{0074}\u{011B}\u{006E}\u{0065}\u{006D}\u{006C}\u{0075}\u{0076}\u{00ED}\u{010D}\u{0065}\u{0073}\u{006B}\u{0079}" + "Proprostnemluvesky-uyb24dma41a" + } + + ! (E) Hebrew: + { + "\u{05DC}\u{05DE}\u{05D4}\u{05D4}\u{05DD}\u{05E4}\u{05E9}\u{05D5}\u{05D8}\u{05DC}\u{05D0}\u{05DE}\u{05D3}\u{05D1}\u{05E8}\u{05D9}\u{05DD}\u{05E2}\u{05D1}\u{05E8}\u{05D9}\u{05EA}" + "4dbcagdahymbxekheh6e0a7fei0b" + } + + ! (F) Hindi (Devanagari): + { + "\u{092F}\u{0939}\u{0932}\u{094B}\u{0917}\u{0939}\u{093F}\u{0928}\u{094D}\u{0926}\u{0940}\u{0915}\u{094D}\u{092F}\u{094B}\u{0902}\u{0928}\u{0939}\u{0940}\u{0902}\u{092C}\u{094B}\u{0932}\u{0938}\u{0915}\u{0924}\u{0947}\u{0939}\u{0948}\u{0902}" + "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" + } + + ! (G) Japanese (kanji and hiragana): + { + "\u{306A}\u{305C}\u{307F}\u{3093}\u{306A}\u{65E5}\u{672C}\u{8A9E}\u{3092}\u{8A71}\u{3057}\u{3066}\u{304F}\u{308C}\u{306A}\u{3044}\u{306E}\u{304B}" + "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" + } + + ! (H) Korean (Hangul syllables): + { + "\u{C138}\u{ACC4}\u{C758}\u{BAA8}\u{B4E0}\u{C0AC}\u{B78C}\u{B4E4}\u{C774}\u{D55C}\u{AD6D}\u{C5B4}\u{B97C}\u{C774}\u{D574}\u{D55C}\u{B2E4}\u{BA74}\u{C5BC}\u{B9C8}\u{B098}\u{C88B}\u{C744}\u{AE4C}" + "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" + } + + ! (I) Russian (Cyrillic): + { + "\u{043F}\u{043E}\u{0447}\u{0435}\u{043C}\u{0443}\u{0436}\u{0435}\u{043E}\u{043D}\u{0438}\u{043D}\u{0435}\u{0433}\u{043E}\u{0432}\u{043E}\u{0440}\u{044F}\u{0442}\u{043F}\u{043E}\u{0440}\u{0443}\u{0441}\u{0441}\u{043A}\u{0438}" + ! FIXME: example has upper-case? "b1abfaaepdrnnbgefbaDotcwatmq2g4l" + "b1abfaaepdrnnbgefbadotcwatmq2g4l" + } + + ! (J) Spanish: + { + "\u{0050}\u{006F}\u{0072}\u{0071}\u{0075}\u{00E9}\u{006E}\u{006F}\u{0070}\u{0075}\u{0065}\u{0064}\u{0065}\u{006E}\u{0073}\u{0069}\u{006D}\u{0070}\u{006C}\u{0065}\u{006D}\u{0065}\u{006E}\u{0074}\u{0065}\u{0068}\u{0061}\u{0062}\u{006C}\u{0061}\u{0072}\u{0065}\u{006E}\u{0045}\u{0073}\u{0070}\u{0061}\u{00F1}\u{006F}\u{006C}" + "PorqunopuedensimplementehablarenEspaol-fmd56a" + } + + ! (K) Vietnamese: + { + "\u{0054}\u{1EA1}\u{0069}\u{0073}\u{0061}\u{006F}\u{0068}\u{1ECD}\u{006B}\u{0068}\u{00F4}\u{006E}\u{0067}\u{0074}\u{0068}\u{1EC3}\u{0063}\u{0068}\u{1EC9}\u{006E}\u{00F3}\u{0069}\u{0074}\u{0069}\u{1EBF}\u{006E}\u{0067}\u{0056}\u{0069}\u{1EC7}\u{0074}" + "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" + } + + ! (L) 3B + { + "\u{0033}\u{5E74}\u{0042}\u{7D44}\u{91D1}\u{516B}\u{5148}\u{751F}" + "3B-ww4c5e180e575a65lsy2b" + } + + ! (M) -with-SUPER-MONKEYS + { + "\u{5B89}\u{5BA4}\u{5948}\u{7F8E}\u{6075}\u{002D}\u{0077}\u{0069}\u{0074}\u{0068}\u{002D}\u{0053}\u{0055}\u{0050}\u{0045}\u{0052}\u{002D}\u{004D}\u{004F}\u{004E}\u{004B}\u{0045}\u{0059}\u{0053}" + "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n" + } + + ! (N) Hello-Another-Way- + { + "\u{0048}\u{0065}\u{006C}\u{006C}\u{006F}\u{002D}\u{0041}\u{006E}\u{006F}\u{0074}\u{0068}\u{0065}\u{0072}\u{002D}\u{0057}\u{0061}\u{0079}\u{002D}\u{305D}\u{308C}\u{305E}\u{308C}\u{306E}\u{5834}\u{6240}" + "Hello-Another-Way--fc4qua05auwb3674vfr0b" + } + + ! (O) 2 + { + "\u{3072}\u{3068}\u{3064}\u{5C4B}\u{6839}\u{306E}\u{4E0B}\u{0032}" + "2-u9tlzr9756bt3uc0v" + } + + ! (P) MajiKoi5 + { + "\u{004D}\u{0061}\u{006A}\u{0069}\u{3067}\u{004B}\u{006F}\u{0069}\u{3059}\u{308B}\u{0035}\u{79D2}\u{524D}" + "MajiKoi5-783gue6qz075azm5e" + } + + ! (Q) de + { + "\u{30D1}\u{30D5}\u{30A3}\u{30FC}\u{0064}\u{0065}\u{30EB}\u{30F3}\u{30D0}" + "de-jg4avhby1noc0d" + } + + ! (R) + { + "\u{305D}\u{306E}\u{30B9}\u{30D4}\u{30FC}\u{30C9}\u{3067}" + "d9juau41awczczp" + } + + ! (S) -> $1.00 <- + { + "\u{002D}\u{003E}\u{0020}\u{0024}\u{0031}\u{002E}\u{0030}\u{0030}\u{0020}\u{003C}\u{002D}" + "-> $1.00 <--" + } +} [ + [ [ 1array ] dip '[ _ punycode> ] unit-test ] + [ 1array swap '[ _ >punycode ] unit-test ] 2bi +] assoc-each + +{ + { "😉.com" "xn--n28h.com" } + { "💩.la" "xn--ls8h.la" } + { "са.com" "xn--80a7a.com" } ! phishing +} [ + [ [ 1array ] dip '[ _ idna> ] unit-test ] + [ 1array swap '[ _ >idna ] unit-test ] 2bi +] assoc-each diff --git a/extra/punycode/punycode.factor b/extra/punycode/punycode.factor new file mode 100644 index 0000000000..f69196a0bd --- /dev/null +++ b/extra/punycode/punycode.factor @@ -0,0 +1,164 @@ +! Copyright (C) 2020 John Benediktsson +! See http://factorcode.org/license.txt for BSD license + +USING: ascii byte-arrays combinators kernel literals locals math +math.order sbufs sequences sequences.extras sets sorting +splitting ; + +IN: punycode + +byte-array ] +>> + +: threshold ( j bias -- T ) + [ BASE * ] [ - ] bi* TMIN TMAX clamp ; + +:: adapt ( delta! #chars first? -- bias ) + delta first? DAMP 2 ? /i delta! + delta dup #chars /i + delta! + 0 [ delta $[ BASE TMIN - TMAX * 2 /i ] > ] [ + delta $[ BASE TMIN - ] /i delta! + BASE + + ] while BASE delta * delta SKEW + /i + ; + +: segregate ( str -- base extended ) + [ N < ] partition members natural-sort ; + +:: find-pos ( str ch i pos -- i' pos' ) + i pos 1 + str [ + ch <=> { + { +eq+ [ 1 + t ] } + { +lt+ [ 1 + f ] } + [ drop f ] + } case + ] find-from drop [ drop -1 -1 ] unless* ; + +:: insertion-unsort ( str extended -- deltas ) + V{ } clone :> accum + N :> oldch! + -1 :> oldi! + extended [| ch | + -1 :> i! + -1 :> pos! + str [ ch < ] count :> curlen + curlen 1 + ch oldch - * :> delta! + [ + str ch i pos find-pos pos! i! + i -1 = [ + f + ] [ + i oldi - delta + delta! + delta 1 - accum push + i oldi! + 0 delta! + t + ] if + ] loop + ch oldch! + ] each accum ; + +:: encode-delta ( delta! bias -- seq ) + SBUF" " clone :> accum + 0 :> j! + [ + j 1 + j! + j bias threshold :> T + delta T < [ + f + delta + ] [ + t + delta T - BASE T - /mod T + swap delta! + ] if DIGITS nth accum push + ] loop accum ; + +:: encode-deltas ( baselen deltas -- seq ) + SBUF" " clone :> accum + BIAS :> bias! + deltas [| delta i | + delta bias encode-delta accum push-all + delta baselen i + 1 + i 0 = adapt bias! + ] each-index accum ; + +PRIVATE> + +:: >punycode ( str -- punicode ) + str segregate :> ( base extended ) + str extended insertion-unsort :> deltas + base length deltas encode-deltas + base [ "-" rot 3append ] unless-empty "" like ; + + delta! + 1 :> w! + 0 :> j! + [ + j 1 + j! + j bias threshold :> T + extpos extended nth decode-digit :> digit + extpos 1 + extpos! + digit w * delta + delta! + BASE T - w * w! + digit T >= + ] loop extpos delta ; + +ERROR: invalid-character char ; + +:: insertion-sort ( base extended -- base ) + N :> ch! + -1 :> pos! + BIAS :> bias! + 0 :> extpos! + extended length :> extlen + [ extpos extlen < ] [ + extended extpos bias decode-delta :> ( newpos delta ) + delta 1 + pos + pos! + pos base length 1 + /mod pos! ch + ch! + ch 0x10FFFF > [ ch invalid-character ] when + ch pos base insert-nth! + delta base length extpos 0 = adapt bias! + newpos extpos! + ] while base ; + +PRIVATE> + +: punycode> ( punycode -- str ) + CHAR: - over last-index [ + ! FIXME: assert all non-basic code-points + [ head >sbuf ] [ 1 + tail ] 2bi >upper + ] [ + SBUF" " clone swap >upper + ] if* insertion-sort "" like ; + +: idna> ( punycode -- str ) + "." split [ + "xn--" ?head [ punycode> ] when + ] map "." join ; + +: >idna ( str -- punycode ) + "." split [ + dup [ N < ] all? [ + >punycode "xn--" prepend + ] unless + ] map "." join ; diff --git a/extra/punycode/summary.txt b/extra/punycode/summary.txt new file mode 100644 index 0000000000..c4f151baba --- /dev/null +++ b/extra/punycode/summary.txt @@ -0,0 +1 @@ +Punycode and Internationalized Domain Names -- 2.34.1