1 ! Copyright (C) 2020 John Benediktsson
2 ! See http://factorcode.org/license.txt for BSD license
4 USING: accessors arrays ascii assocs byte-arrays combinators
5 io.encodings.string io.encodings.utf8 kernel lexer linked-assocs
6 literals locals make math math.order math.parser multiline
7 namespaces peg.ebnf present prettyprint.backend
8 prettyprint.custom prettyprint.sections regexp sbufs sequences
9 sequences.extras sequences.generalizations sets sorting
10 splitting strings strings.parser urls urls.encoding
11 urls.encoding.private urls.private ;
25 CONSTANT: DIGITS $[ "abcdefghijklmnopqrstuvwxyz0123456789" >byte-array ]
28 : threshold ( j bias -- T )
29 [ BASE * ] [ - ] bi* TMIN TMAX clamp ;
31 :: adapt ( delta! #chars first? -- bias )
32 delta first? DAMP 2 ? /i delta!
33 delta dup #chars /i + delta!
34 0 [ delta $[ BASE TMIN - TMAX * 2 /i ] > ] [
35 delta $[ BASE TMIN - ] /i delta!
37 ] while BASE delta * delta SKEW + /i + ;
39 : segregate ( str -- base extended )
40 [ N < ] partition members natural-sort ;
42 :: find-pos ( str ch i pos -- i' pos' )
49 ] find-from drop [ drop -1 -1 ] unless* ;
51 :: insertion-unsort ( str extended -- deltas )
58 str [ ch < ] count :> curlen
59 curlen 1 + ch oldch - * :> delta!
61 str ch i pos find-pos pos! i!
65 i oldi - delta + delta!
75 :: encode-delta ( delta! bias -- seq )
76 SBUF" " clone :> accum
86 delta T - BASE T - /mod T + swap delta!
87 ] if DIGITS nth accum push
90 :: encode-deltas ( baselen deltas -- seq )
91 SBUF" " clone :> accum
94 delta bias encode-delta accum push-all
95 delta baselen i + 1 + i 0 = adapt bias!
100 :: >punycode ( str -- punicode )
101 str segregate :> ( base extended )
102 str extended insertion-unsort :> deltas
103 base length deltas encode-deltas
104 base [ "-" rot 3append ] unless-empty "" like ;
108 ERROR: invalid-digit char ;
110 :: decode-digit ( ch -- digit )
112 { [ ch CHAR: A CHAR: Z between? ] [ ch CHAR: A - ] }
113 { [ ch CHAR: 0 CHAR: 9 between? ] [ ch CHAR: 0 26 - - ] }
117 :: decode-delta ( extended extpos! bias -- extpos' delta )
123 j bias threshold :> T
124 extpos extended nth decode-digit :> digit
126 digit w * delta + delta!
129 ] loop extpos delta ;
131 ERROR: invalid-character char ;
133 :: insertion-sort ( base extended -- base )
138 extended length :> extlen
139 [ extpos extlen < ] [
140 extended extpos bias decode-delta :> ( newpos delta )
142 pos base length 1 + /mod pos! ch + ch!
143 ch 0x10FFFF > [ ch invalid-character ] when
144 ch pos base insert-nth!
145 delta base length extpos 0 = adapt bias!
151 : punycode> ( punycode -- str )
152 CHAR: - over last-index [
153 ! FIXME: assert all non-basic code-points
154 [ head >sbuf ] [ 1 + tail ] 2bi >upper
156 SBUF" " clone swap >upper
157 ] if* insertion-sort "" like ;
159 : idna> ( punycode -- str )
161 "xn--" ?head [ punycode> ] when
164 : >idna ( str -- punycode )
167 >punycode "xn--" prepend
173 : <irl> ( -- irl ) irl new ;
175 GENERIC: >irl ( obj -- irl )
177 M: f >irl drop <irl> ;
181 : irl-decode ( str -- str' )
182 "" like R/ (%[a-fA-F0-9]{2})+/ [ url-decode ] re-replace-with ;
184 : iquery-decode ( str -- decoded )
185 "+" split "%20" join irl-decode ;
187 : iquery>assoc ( query -- assoc )
189 "&;" split <linked-hash> [
191 [ "=" split1 [ dup [ iquery-decode ] when ] bi@ swap ] dip
197 : assoc>iquery ( assoc -- str )
201 dup array? [ 1array ] unless
202 [ "=" glue , ] with each
205 ] { } make "&" join ;
210 protocol = [a-zA-Z0-9.+-]+ => [[ irl-decode ]]
211 username = [^/:@#?]+ => [[ irl-decode ]]
212 password = [^/:@#?]+ => [[ irl-decode ]]
213 path = [^#?]+ => [[ irl-decode ]]
214 query = [^#]+ => [[ iquery>assoc ]]
215 anchor = .+ => [[ irl-decode ]]
216 hostname = [^/#?:]+ => [[ irl-decode ]]
217 port = [^/#?]+ => [[ url-decode parse-port ]]
219 auth = username (":"~ password)? "@"~
220 host = hostname (":"~ port)?
222 url = (protocol ":"~)?
230 : unparse-ihost-part ( url -- )
232 [ unparse-username-password ]
234 [ url-port [ ":" % # ] when* ]
235 [ path>> "/" head? [ "/" % ] unless ]
238 : unparse-iauthority ( url -- )
239 dup host>> [ "//" % unparse-ihost-part ] [ drop ] if ;
245 [ unparse-iauthority ]
247 [ query>> dup assoc-empty? [ drop ] [ "?" % assoc>iquery % ] if ]
248 [ anchor>> [ "#" % present % ] when* ]
255 [ <irl> ] dip parse-irl 5 firstn {
256 [ >lower >>protocol ]
259 [ first [ first2 [ >>username ] [ >>password ] bi* ] when* ]
260 [ second [ first2 [ >>host ] [ >>port ] bi* ] when* ] bi
266 } spread dup host>> [ [ "/" or ] change-path ] when ;
270 [ protocol>> >>protocol ]
271 [ username>> >>username ]
272 [ password>> >>password ]
273 [ host>> [ >idna ] [ f ] if* >>host ]
277 [ anchor>> >>anchor ]
282 [ protocol>> >>protocol ]
283 [ username>> >>username ]
284 [ password>> >>password ]
285 [ host>> [ idna> ] [ f ] if* >>host ]
289 [ anchor>> >>anchor ]
292 SYNTAX: IRL" lexer get skip-blank parse-string >irl suffix! ;
296 dup present "IRL\" " "\"" pprint-string ;