]> gitweb.factorcode.org Git - factor.git/commitdiff
fix unit test in robots
authorDoug Coleman <erg@jobim.local>
Tue, 7 Apr 2009 15:08:01 +0000 (10:08 -0500)
committerDoug Coleman <erg@jobim.local>
Tue, 7 Apr 2009 15:08:01 +0000 (10:08 -0500)
extra/robots/robots-tests.factor
extra/robots/robots.factor

index a590d9eee0d6b377c12b11353da68eedc544779d..54b489268018b6d339121ad82e21d17e3d3dd397 100644 (file)
 ! Copyright (C) 2009 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: calendar io.encodings.utf8 io.files robots tools.test ;
+USING: calendar io.encodings.utf8 io.files robots tools.test
+urls ;
 IN: robots.tests
 
 [
-{ "http://www.chiplist.com/sitemap.txt" }
-{
-    T{ rules
-        { user-agents V{ "*" } }
-        { allows V{ } }
-        { disallows
-            V{
-                "/cgi-bin/"
-                "/scripts/"
-                "/ChipList2/scripts/"
-                "/ChipList2/styles/"
-                "/ads/"
-                "/ChipList2/ads/"
-                "/advertisements/"
-                "/ChipList2/advertisements/"
-                "/graphics/"
-                "/ChipList2/graphics/"
+    { "http://www.chiplist.com/sitemap.txt" }
+    {
+        T{ rules
+            { user-agents V{ "*" } }
+            { allows V{ } }
+            { disallows
+                V{
+                    URL" /cgi-bin/"
+                    URL" /scripts/"
+                    URL" /ChipList2/scripts/"
+                    URL" /ChipList2/styles/"
+                    URL" /ads/"
+                    URL" /ChipList2/ads/"
+                    URL" /advertisements/"
+                    URL" /ChipList2/advertisements/"
+                    URL" /graphics/"
+                    URL" /ChipList2/graphics/"
+                }
             }
-        }
-        { visit-time
-            {
-                T{ timestamp { hour 2 } }
-                T{ timestamp { hour 5 } }
+            { visit-time
+                {
+                    T{ timestamp { hour 2 } }
+                    T{ timestamp { hour 5 } }
+                }
             }
+            { request-rate 1 }
+            { crawl-delay 1 }
+            { unknowns H{ } }
         }
-        { request-rate 1 }
-        { crawl-delay 1 }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "UbiCrawler" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "DOC" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "Zao" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "sitecheck.internetseer.com" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "Zealbot" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "MSIECrawler" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "SiteSnagger" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "WebStripper" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "WebCopier" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "Fetch" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "Offline Explorer" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "Teleport" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "TeleportPro" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "WebZIP" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "linko" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "HTTrack" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "Microsoft.URL.Control" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "Xenu" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "larbin" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "libwww" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "ZyBORG" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "Download Ninja" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "wget" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "grub-client" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "k2spider" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "NPBot" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents V{ "WebReaper" } }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
-    }
-    T{ rules
-        { user-agents
-            V{
-                "abot"
-                "ALeadSoftbot"
-                "BeijingCrawler"
-                "BilgiBot"
-                "bot"
-                "botlist"
-                "BOTW Spider"
-                "bumblebee"
-                "Bumblebee"
-                "BuzzRankingBot"
-                "Charlotte"
-                "Clushbot"
-                "Crawler"
-                "CydralSpider"
-                "DataFountains"
-                "DiamondBot"
-                "Dulance bot"
-                "DYNAMIC"
-                "EARTHCOM.info"
-                "EDI"
-                "envolk"
-                "Exabot"
-                "Exabot-Images"
-                "Exabot-Test"
-                "exactseek-pagereaper"
-                "Exalead NG"
-                "FANGCrawl"
-                "Feed::Find"
-                "flatlandbot"
-                "Gigabot"
-                "GigabotSiteSearch"
-                "GurujiBot"
-                "Hatena Antenna"
-                "Hatena Bookmark"
-                "Hatena RSS"
-                "HatenaScreenshot"
-                "Helix"
-                "HiddenMarket"
-                "HyperEstraier"
-                "iaskspider"
-                "IIITBOT"
-                "InfociousBot"
-                "iVia"
-                "iVia Page Fetcher"
-                "Jetbot"
-                "Kolinka Forum Search"
-                "KRetrieve"
-                "LetsCrawl.com"
-                "Lincoln State Web Browser"
-                "Links4US-Crawler"
-                "LOOQ"
-                "Lsearch/sondeur"
-                "MapoftheInternet.com"
-                "NationalDirectory"
-                "NetCarta_WebMapper"
-                "NewsGator"
-                "NextGenSearchBot"
-                "ng"
-                "nicebot"
-                "NP"
-                "NPBot"
-                "Nudelsalat"
-                "Nutch"
-                "OmniExplorer_Bot"
-                "OpenIntelligenceData"
-                "Oracle Enterprise Search"
-                "Pajaczek"
-                "panscient.com"
-                "PeerFactor 404 crawler"
-                "PeerFactor Crawler"
-                "PlantyNet"
-                "PlantyNet_WebRobot"
-                "plinki"
-                "PMAFind"
-                "Pogodak!"
-                "QuickFinder Crawler"
-                "Radiation Retriever"
-                "Reaper"
-                "RedCarpet"
-                "ScorpionBot"
-                "Scrubby"
-                "Scumbot"
-                "searchbot"
-                "Seeker.lookseek.com"
-                "SeznamBot"
-                "ShowXML"
-                "snap.com"
-                "snap.com beta crawler"
-                "Snapbot"
-                "SnapPreviewBot"
-                "sohu"
-                "SpankBot"
-                "Speedy Spider"
-                "Speedy_Spider"
-                "SpeedySpider"
-                "spider"
-                "SquigglebotBot"
-                "SurveyBot"
-                "SynapticSearch"
-                "T-H-U-N-D-E-R-S-T-O-N-E"
-                "Talkro Web-Shot"
-                "Tarantula"
-                "TerrawizBot"
-                "TheInformant"
-                "TMCrawler"
-                "TridentSpider"
-                "Tutorial Crawler"
-                "Twiceler"
-                "unwrapbot"
-                "URI::Fetch"
-                "VengaBot"
-                "Vonna.com b o t"
-                "Vortex"
-                "Votay bot"
-                "WebAlta Crawler"
-                "Webbot"
-                "Webclipping.com"
-                "WebCorp"
-                "Webinator"
-                "WIRE"
-                "WISEbot"
-                "Xerka WebBot"
-                "XSpider"
-                "YodaoBot"
-                "Yoono"
-                "yoono"
+        T{ rules
+            { user-agents V{ "UbiCrawler" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "DOC" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "Zao" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "sitecheck.internetseer.com" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "Zealbot" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "MSIECrawler" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "SiteSnagger" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "WebStripper" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "WebCopier" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "Fetch" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "Offline Explorer" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "Teleport" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "TeleportPro" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "WebZIP" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "linko" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "HTTrack" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "Microsoft.URL.Control" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "Xenu" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "larbin" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "libwww" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "ZyBORG" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "Download Ninja" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "wget" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "grub-client" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "k2spider" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "NPBot" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents V{ "WebReaper" } }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
+        }
+        T{ rules
+            { user-agents
+                V{
+                    "abot"
+                    "ALeadSoftbot"
+                    "BeijingCrawler"
+                    "BilgiBot"
+                    "bot"
+                    "botlist"
+                    "BOTW Spider"
+                    "bumblebee"
+                    "Bumblebee"
+                    "BuzzRankingBot"
+                    "Charlotte"
+                    "Clushbot"
+                    "Crawler"
+                    "CydralSpider"
+                    "DataFountains"
+                    "DiamondBot"
+                    "Dulance bot"
+                    "DYNAMIC"
+                    "EARTHCOM.info"
+                    "EDI"
+                    "envolk"
+                    "Exabot"
+                    "Exabot-Images"
+                    "Exabot-Test"
+                    "exactseek-pagereaper"
+                    "Exalead NG"
+                    "FANGCrawl"
+                    "Feed::Find"
+                    "flatlandbot"
+                    "Gigabot"
+                    "GigabotSiteSearch"
+                    "GurujiBot"
+                    "Hatena Antenna"
+                    "Hatena Bookmark"
+                    "Hatena RSS"
+                    "HatenaScreenshot"
+                    "Helix"
+                    "HiddenMarket"
+                    "HyperEstraier"
+                    "iaskspider"
+                    "IIITBOT"
+                    "InfociousBot"
+                    "iVia"
+                    "iVia Page Fetcher"
+                    "Jetbot"
+                    "Kolinka Forum Search"
+                    "KRetrieve"
+                    "LetsCrawl.com"
+                    "Lincoln State Web Browser"
+                    "Links4US-Crawler"
+                    "LOOQ"
+                    "Lsearch/sondeur"
+                    "MapoftheInternet.com"
+                    "NationalDirectory"
+                    "NetCarta_WebMapper"
+                    "NewsGator"
+                    "NextGenSearchBot"
+                    "ng"
+                    "nicebot"
+                    "NP"
+                    "NPBot"
+                    "Nudelsalat"
+                    "Nutch"
+                    "OmniExplorer_Bot"
+                    "OpenIntelligenceData"
+                    "Oracle Enterprise Search"
+                    "Pajaczek"
+                    "panscient.com"
+                    "PeerFactor 404 crawler"
+                    "PeerFactor Crawler"
+                    "PlantyNet"
+                    "PlantyNet_WebRobot"
+                    "plinki"
+                    "PMAFind"
+                    "Pogodak!"
+                    "QuickFinder Crawler"
+                    "Radiation Retriever"
+                    "Reaper"
+                    "RedCarpet"
+                    "ScorpionBot"
+                    "Scrubby"
+                    "Scumbot"
+                    "searchbot"
+                    "Seeker.lookseek.com"
+                    "SeznamBot"
+                    "ShowXML"
+                    "snap.com"
+                    "snap.com beta crawler"
+                    "Snapbot"
+                    "SnapPreviewBot"
+                    "sohu"
+                    "SpankBot"
+                    "Speedy Spider"
+                    "Speedy_Spider"
+                    "SpeedySpider"
+                    "spider"
+                    "SquigglebotBot"
+                    "SurveyBot"
+                    "SynapticSearch"
+                    "T-H-U-N-D-E-R-S-T-O-N-E"
+                    "Talkro Web-Shot"
+                    "Tarantula"
+                    "TerrawizBot"
+                    "TheInformant"
+                    "TMCrawler"
+                    "TridentSpider"
+                    "Tutorial Crawler"
+                    "Twiceler"
+                    "unwrapbot"
+                    "URI::Fetch"
+                    "VengaBot"
+                    "Vonna.com b o t"
+                    "Vortex"
+                    "Votay bot"
+                    "WebAlta Crawler"
+                    "Webbot"
+                    "Webclipping.com"
+                    "WebCorp"
+                    "Webinator"
+                    "WIRE"
+                    "WISEbot"
+                    "Xerka WebBot"
+                    "XSpider"
+                    "YodaoBot"
+                    "Yoono"
+                    "yoono"
+                }
             }
+            { allows V{ } }
+            { disallows V{ URL" /" } }
+            { unknowns H{ } }
         }
-        { allows V{ } }
-        { disallows V{ "/" } }
-        { unknowns H{ } }
     }
-}
 ] [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test
index 242aa1dea2e7dfb9f5f3516bdb9655012df54dba..3c0eb045f7598046f8dee75b212a9cf0d06232c7 100644 (file)
@@ -85,7 +85,7 @@ PRIVATE>
 : parse-robots.txt ( string -- sitemaps rules-seq )
     normalize-robots.txt [
         [ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
-    ] map first ;
+    ] map ;
 
 : robots ( url -- robots )
     >url