1 ! Copyright (C) 2020 John Benediktsson
2 ! See http://factorcode.org/license.txt for BSD license
4 USING: accessors arrays ascii assocs byte-arrays combinators
5 io.encodings.string io.encodings.utf8 kernel lexer literals
6 locals make math math.order math.parser multiline namespaces
7 peg.ebnf present prettyprint.backend prettyprint.custom
8 prettyprint.sections regexp sbufs sequences sequences.extras
9 sets sorting splitting strings strings.parser urls urls.encoding
10 urls.encoding.private urls.private ;
24 CONSTANT: DIGITS $[ "abcdefghijklmnopqrstuvwxyz0123456789" >byte-array ]
27 : threshold ( j bias -- T )
28 [ BASE * ] [ - ] bi* TMIN TMAX clamp ;
30 :: adapt ( delta! #chars first? -- bias )
31 delta first? DAMP 2 ? /i delta!
32 delta dup #chars /i + delta!
33 0 [ delta $[ BASE TMIN - TMAX * 2 /i ] > ] [
34 delta $[ BASE TMIN - ] /i delta!
36 ] while BASE delta * delta SKEW + /i + ;
38 : segregate ( str -- base extended )
39 [ N < ] partition members natural-sort ;
41 :: find-pos ( str ch i pos -- i' pos' )
48 ] find-from drop [ drop -1 -1 ] unless* ;
50 :: insertion-unsort ( str extended -- deltas )
57 str [ ch < ] count :> curlen
58 curlen 1 + ch oldch - * :> delta!
60 str ch i pos find-pos pos! i!
64 i oldi - delta + delta!
74 :: encode-delta ( delta! bias -- seq )
75 SBUF" " clone :> accum
85 delta T - BASE T - /mod T + swap delta!
86 ] if DIGITS nth accum push
89 :: encode-deltas ( baselen deltas -- seq )
90 SBUF" " clone :> accum
93 delta bias encode-delta accum push-all
94 delta baselen i + 1 + i 0 = adapt bias!
99 :: >punycode ( str -- punicode )
100 str segregate :> ( base extended )
101 str extended insertion-unsort :> deltas
102 base length deltas encode-deltas
103 base [ "-" rot 3append ] unless-empty "" like ;
107 ERROR: invalid-digit char ;
109 :: decode-digit ( ch -- digit )
111 { [ ch CHAR: A CHAR: Z between? ] [ ch CHAR: A - ] }
112 { [ ch CHAR: 0 CHAR: 9 between? ] [ ch CHAR: 0 26 - - ] }
116 :: decode-delta ( extended extpos! bias -- extpos' delta )
122 j bias threshold :> T
123 extpos extended nth decode-digit :> digit
125 digit w * delta + delta!
128 ] loop extpos delta ;
130 ERROR: invalid-character char ;
132 :: insertion-sort ( base extended -- base )
137 extended length :> extlen
138 [ extpos extlen < ] [
139 extended extpos bias decode-delta :> ( newpos delta )
141 pos base length 1 + /mod pos! ch + ch!
142 ch 0x10FFFF > [ ch invalid-character ] when
143 ch pos base insert-nth!
144 delta base length extpos 0 = adapt bias!
150 : punycode> ( punycode -- str )
151 CHAR: - over last-index [
152 ! FIXME: assert all non-basic code-points
153 [ head >sbuf ] [ 1 + tail ] 2bi >upper
155 SBUF" " clone swap >upper
156 ] if* insertion-sort "" like ;
158 : idna> ( punycode -- str )
160 "xn--" ?head [ punycode> ] when
163 : >idna ( str -- punycode )
166 >punycode "xn--" prepend
172 : <irl> ( -- irl ) irl new ;
174 GENERIC: >irl ( obj -- irl )
176 M: f >irl drop <irl> ;
180 : irl-decode ( str -- str' )
181 "" like R/ (%[a-fA-F0-9]{2})+/ [ url-decode ] re-replace-with ;
186 protocol = [a-zA-Z0-9.+-]+ => [[ irl-decode ]]
187 username = [^/:@#?]+ => [[ irl-decode ]]
188 password = [^/:@#?]+ => [[ irl-decode ]]
189 pathname = [^#?]+ => [[ irl-decode ]]
190 query = [^#]+ => [[ query>assoc ]]
191 anchor = .+ => [[ irl-decode ]]
193 hostname = [^/#?]+ => [[ irl-decode ]]
195 hostname-spec = hostname ("/"|!(.)) => [[ first ]]
197 auth = (username (":" password => [[ second ]])? "@"
198 => [[ first2 2array ]])?
200 url = (((protocol "://") => [[ first ]] auth hostname)
201 | (("//") => [[ f ]] auth hostname))?
203 ("?" query => [[ second ]])?
204 ("#" anchor => [[ second ]])?
208 : unparse-ihost-part ( url -- )
210 [ unparse-username-password ]
212 [ url-port [ ":" % # ] when* ]
213 [ path>> "/" head? [ "/" % ] unless ]
216 : unparse-iauthority ( url -- )
217 dup host>> [ "//" % unparse-ihost-part ] [ drop ] if ;
223 [ unparse-iauthority ]
225 [ query>> dup assoc-empty? [ drop ] [ "?" % assoc>query % ] if ]
226 [ anchor>> [ "#" % present % ] when* ]
230 : parse-ihost ( string -- host/f port/f )
232 ":" split1-last [ irl-decode ] [
233 [ f ] [ string>number [ malformed-port ] unless* ] if-empty
244 [ first >lower >>protocol ]
247 [ first [ first2 [ >>username ] [ >>password ] bi* ] when* ]
248 [ second parse-ihost [ >>host ] [ >>port ] bi* ] bi
256 dup host>> [ [ "/" or ] change-path ] when ;
260 [ protocol>> >>protocol ]
261 [ username>> >>username ]
262 [ password>> >>password ]
263 [ host>> [ >idna url-encode ] [ f ] if* >>host ]
265 [ path>> [ url-encode ] [ f ] if* >>path ]
266 [ query>> [ url-encode ] [ f ] if* >>query ]
267 [ anchor>> [ url-encode ] [ f ] if* >>anchor ]
272 [ protocol>> >>protocol ]
273 [ username>> >>username ]
274 [ password>> >>password ]
275 [ host>> [ url-decode idna> ] [ f ] if* >>host ]
277 [ path>> [ url-decode ] [ f ] if* >>path ]
278 [ query>> [ url-decode ] [ f ] if* >>query ]
279 [ anchor>> [ url-decode ] [ f ] if* >>anchor ]
282 SYNTAX: IRL" lexer get skip-blank parse-string >irl suffix! ;
286 dup present "IRL\" " "\"" pprint-string ;