gitweb.factorcode.org Git - factor.git/commitdiff
Extending Url Parsing (#2354)
author David Mindlin <davidmindlin4@gmail.com>
Thu, 29 Oct 2020 14:15:52 +0000 (07:15 -0700)
committer GitHub <noreply@github.com>
Thu, 29 Oct 2020 14:15:52 +0000 (09:15 -0500)
* added new test cases for urls

* Parsing URLs that do not have an authority component

* Increasing the robustness of the URL parser for passwords

* Rewrote the no authority case for urls and updated prettyprinting to
reflect the change.

* fixed matching empty port

* added upper bound for port number

* Updated the url parsing to accept an empty username.

* check if port is valid

* Updated the url tests to test new extensions to the url library.

* Updating the style for recently added extensions to urls.factor.

* removed sanity check for port, want to allow for IPvX support

Co-authored-by: David Flores <dflores0818@gmail.com>
basis/urls/urls-tests.factor
basis/urls/urls.factor

index fb14e1ecba32f449fe23aa1bd695f9d16846d2ac..0447be47e6503206b397f657dd3c98037d46f792 100644 (file)
@@ -115,6 +115,33 @@ CONSTANT: urls {
          }
         "t1000://www.google.com/"
     }
+    {
+        T{ url
+            { protocol "no-auth" }
+            { path "/some/random/path" }
+        }
+        "no-auth:/some/random/path"
+    }
+    {
+        T{ url
+            { protocol "http" }
+            { host "example.org" }
+            { path "/" }
+            { username "user" }
+            { password "" }
+        }
+        "http://user:@example.org/"
+    }
+    {
+        T{ url
+            { protocol "http" }
+            { host "example.org" }
+            { path "/" }
+            { username "" }
+            { password "pass" }
+        }
+        "http://:pass@example.org/"
+    }
 }
 
 urls [
@@ -125,6 +152,20 @@ urls [
     swap [ 1array ] [ [ present ] curry ] bi* unit-test
 ] assoc-each
 
+{ T{ url
+    { protocol "https" }
+    { host "www.google.com" }
+    { path "/" }
+   } }
+[ "https://www.google.com:/" >url ] unit-test
+
+{ "https://www.google.com/" } 
+[ T{ url
+    { protocol "https" }
+    { host "www.google.com" }
+    { path "/" }
+} present ] unit-test
+
 { "b" } [ "a" "b" url-append-path ] unit-test
 
 { "a/b" } [ "a/c" "b" url-append-path ] unit-test
index 7439b5d359519145da4a7e673afb60ff2e7c7f5b..0a1deade940e4904b15133594b524782bf82a499 100644 (file)
@@ -5,7 +5,7 @@ USING: accessors arrays ascii assocs combinators fry
 io.pathnames io.sockets io.sockets.secure kernel lexer
 linked-assocs make math.parser multiline namespaces peg.ebnf
 present sequences splitting strings strings.parser urls.encoding
-vocabs.loader ;
+vocabs.loader math math.order ;
 
 IN: urls
 
@@ -30,7 +30,10 @@ ERROR: malformed-port ;
 : parse-host ( string -- host/f port/f )
     [
         ":" split1-last [ url-decode ]
-        [ dup [ string>number [ malformed-port ] unless* ] when ] bi*
+        [ [ f ] 
+          [ string>number [ malformed-port ] unless* ]
+          if-empty 
+        ] bi*
     ] [ f f ] if* ;
 
 GENERIC: >url ( obj -- url )
@@ -44,8 +47,8 @@ M: url >url ;
 EBNF: parse-url [=[
 
 protocol = [a-zA-Z0-9.+-]+          => [[ url-decode ]]
-username = [^/:@#?]+                => [[ url-decode ]]
-password = [^/:@#?]+                => [[ url-decode ]]
+username = [^/:@#?]*                => [[ url-decode ]]
+password = [^/:@#?]*                => [[ url-decode ]]
 pathname = [^#?]+                   => [[ url-decode ]]
 query    = [^#]+                    => [[ query>assoc ]]
 anchor   = .+                       => [[ url-decode ]]
@@ -58,7 +61,8 @@ auth     = (username (":" password  => [[ second ]])? "@"
                                     => [[ first2 2array ]])?
 
 url      = (((protocol "://") => [[ first ]] auth hostname)
-                    | (("//") => [[ f ]] auth hostname))?
+                    | (("//") => [[ f ]] auth hostname)
+                    | ((protocol ":") => [[ first V{ f f } V{ } 2sequence ]]))?
            (pathname)?
            ("?" query               => [[ second ]])?
            ("#" anchor              => [[ second ]])?
@@ -117,20 +121,16 @@ M: pathname >url string>> >url ;
 
 ! URL" //foo.com" takes on the protocol of the url it's derived from
 : unparse-protocol ( url -- )
-    dup protocol>> [
-        % "://" % unparse-host-part
-    ] [
-        dup host>> [
-            "//" % unparse-host-part
-        ] [
-            drop
-        ] if
-    ] if* ;
+    protocol>> [ % ":" % ] when* ;
+
+: unparse-authority ( url -- )
+    dup host>> [ "//" % unparse-host-part ] [ drop ] if ;
 
 M: url present
     [
         {
             [ unparse-protocol ]
+            [ unparse-authority ]
             [ path>> url-encode % ]
             [ query>> dup assoc-empty? [ drop ] [ "?" % assoc>query % ] if ]
             [ anchor>> [ "#" % present url-encode % ] when* ]