]> gitweb.factorcode.org Git - factor.git/blob - extra/robots/robots-tests.factor
54b489268018b6d339121ad82e21d17e3d3dd397
[factor.git] / extra / robots / robots-tests.factor
1 ! Copyright (C) 2009 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: calendar io.encodings.utf8 io.files robots tools.test
4 urls ;
5 IN: robots.tests
6
! Checks parse-robots.txt against the file at vocab:robots/robots.txt.
! The first quotation holds the two values the second quotation is expected
! to leave on the stack.
! NOTE(review): the leading integer on every line looks like a line number
! carried over from the gitweb page view rather than Factor source -- confirm
! against the repository copy before running.
7 [
      ! First expected value: a one-element array holding a sitemap URL string
      ! (presumably the sitemap entries found in the file -- TODO confirm).
8     { "http://www.chiplist.com/sitemap.txt" }
      ! Second expected value: an array of rules tuples, listed in the order
      ! the test expects them.
9     {
          ! Wildcard "*" record: the only tuple below that also sets
          ! visit-time, request-rate and crawl-delay.
10         T{ rules
11             { user-agents V{ "*" } }
12             { allows V{ } }
13             { disallows
14                 V{
15                     URL" /cgi-bin/"
16                     URL" /scripts/"
17                     URL" /ChipList2/scripts/"
18                     URL" /ChipList2/styles/"
19                     URL" /ads/"
20                     URL" /ChipList2/ads/"
21                     URL" /advertisements/"
22                     URL" /ChipList2/advertisements/"
23                     URL" /graphics/"
24                     URL" /ChipList2/graphics/"
25                 }
26             }
27             { visit-time
28                 {
29                     T{ timestamp { hour 2 } }
30                     T{ timestamp { hour 5 } }
31                 }
32             }
33             { request-rate 1 }
34             { crawl-delay 1 }
35             { unknowns H{ } }
36         }
          ! Single-agent records: each names one user agent, has empty allows
          ! and unknowns, and disallows the root path.
37         T{ rules
38             { user-agents V{ "UbiCrawler" } }
39             { allows V{ } }
40             { disallows V{ URL" /" } }
41             { unknowns H{ } }
42         }
43         T{ rules
44             { user-agents V{ "DOC" } }
45             { allows V{ } }
46             { disallows V{ URL" /" } }
47             { unknowns H{ } }
48         }
49         T{ rules
50             { user-agents V{ "Zao" } }
51             { allows V{ } }
52             { disallows V{ URL" /" } }
53             { unknowns H{ } }
54         }
55         T{ rules
56             { user-agents V{ "sitecheck.internetseer.com" } }
57             { allows V{ } }
58             { disallows V{ URL" /" } }
59             { unknowns H{ } }
60         }
61         T{ rules
62             { user-agents V{ "Zealbot" } }
63             { allows V{ } }
64             { disallows V{ URL" /" } }
65             { unknowns H{ } }
66         }
67         T{ rules
68             { user-agents V{ "MSIECrawler" } }
69             { allows V{ } }
70             { disallows V{ URL" /" } }
71             { unknowns H{ } }
72         }
73         T{ rules
74             { user-agents V{ "SiteSnagger" } }
75             { allows V{ } }
76             { disallows V{ URL" /" } }
77             { unknowns H{ } }
78         }
79         T{ rules
80             { user-agents V{ "WebStripper" } }
81             { allows V{ } }
82             { disallows V{ URL" /" } }
83             { unknowns H{ } }
84         }
85         T{ rules
86             { user-agents V{ "WebCopier" } }
87             { allows V{ } }
88             { disallows V{ URL" /" } }
89             { unknowns H{ } }
90         }
91         T{ rules
92             { user-agents V{ "Fetch" } }
93             { allows V{ } }
94             { disallows V{ URL" /" } }
95             { unknowns H{ } }
96         }
97         T{ rules
98             { user-agents V{ "Offline Explorer" } }
99             { allows V{ } }
100             { disallows V{ URL" /" } }
101             { unknowns H{ } }
102         }
103         T{ rules
104             { user-agents V{ "Teleport" } }
105             { allows V{ } }
106             { disallows V{ URL" /" } }
107             { unknowns H{ } }
108         }
109         T{ rules
110             { user-agents V{ "TeleportPro" } }
111             { allows V{ } }
112             { disallows V{ URL" /" } }
113             { unknowns H{ } }
114         }
115         T{ rules
116             { user-agents V{ "WebZIP" } }
117             { allows V{ } }
118             { disallows V{ URL" /" } }
119             { unknowns H{ } }
120         }
121         T{ rules
122             { user-agents V{ "linko" } }
123             { allows V{ } }
124             { disallows V{ URL" /" } }
125             { unknowns H{ } }
126         }
127         T{ rules
128             { user-agents V{ "HTTrack" } }
129             { allows V{ } }
130             { disallows V{ URL" /" } }
131             { unknowns H{ } }
132         }
133         T{ rules
134             { user-agents V{ "Microsoft.URL.Control" } }
135             { allows V{ } }
136             { disallows V{ URL" /" } }
137             { unknowns H{ } }
138         }
139         T{ rules
140             { user-agents V{ "Xenu" } }
141             { allows V{ } }
142             { disallows V{ URL" /" } }
143             { unknowns H{ } }
144         }
145         T{ rules
146             { user-agents V{ "larbin" } }
147             { allows V{ } }
148             { disallows V{ URL" /" } }
149             { unknowns H{ } }
150         }
151         T{ rules
152             { user-agents V{ "libwww" } }
153             { allows V{ } }
154             { disallows V{ URL" /" } }
155             { unknowns H{ } }
156         }
157         T{ rules
158             { user-agents V{ "ZyBORG" } }
159             { allows V{ } }
160             { disallows V{ URL" /" } }
161             { unknowns H{ } }
162         }
163         T{ rules
164             { user-agents V{ "Download Ninja" } }
165             { allows V{ } }
166             { disallows V{ URL" /" } }
167             { unknowns H{ } }
168         }
169         T{ rules
170             { user-agents V{ "wget" } }
171             { allows V{ } }
172             { disallows V{ URL" /" } }
173             { unknowns H{ } }
174         }
175         T{ rules
176             { user-agents V{ "grub-client" } }
177             { allows V{ } }
178             { disallows V{ URL" /" } }
179             { unknowns H{ } }
180         }
181         T{ rules
182             { user-agents V{ "k2spider" } }
183             { allows V{ } }
184             { disallows V{ URL" /" } }
185             { unknowns H{ } }
186         }
187         T{ rules
188             { user-agents V{ "NPBot" } }
189             { allows V{ } }
190             { disallows V{ URL" /" } }
191             { unknowns H{ } }
192         }
193         T{ rules
194             { user-agents V{ "WebReaper" } }
195             { allows V{ } }
196             { disallows V{ URL" /" } }
197             { unknowns H{ } }
198         }
          ! Final record: many user agents grouped in one tuple, again with
          ! empty allows and unknowns and the root path disallowed.
199         T{ rules
200             { user-agents
201                 V{
202                     "abot"
203                     "ALeadSoftbot"
204                     "BeijingCrawler"
205                     "BilgiBot"
206                     "bot"
207                     "botlist"
208                     "BOTW Spider"
209                     "bumblebee"
210                     "Bumblebee"
211                     "BuzzRankingBot"
212                     "Charlotte"
213                     "Clushbot"
214                     "Crawler"
215                     "CydralSpider"
216                     "DataFountains"
217                     "DiamondBot"
218                     "Dulance bot"
219                     "DYNAMIC"
220                     "EARTHCOM.info"
221                     "EDI"
222                     "envolk"
223                     "Exabot"
224                     "Exabot-Images"
225                     "Exabot-Test"
226                     "exactseek-pagereaper"
227                     "Exalead NG"
228                     "FANGCrawl"
229                     "Feed::Find"
230                     "flatlandbot"
231                     "Gigabot"
232                     "GigabotSiteSearch"
233                     "GurujiBot"
234                     "Hatena Antenna"
235                     "Hatena Bookmark"
236                     "Hatena RSS"
237                     "HatenaScreenshot"
238                     "Helix"
239                     "HiddenMarket"
240                     "HyperEstraier"
241                     "iaskspider"
242                     "IIITBOT"
243                     "InfociousBot"
244                     "iVia"
245                     "iVia Page Fetcher"
246                     "Jetbot"
247                     "Kolinka Forum Search"
248                     "KRetrieve"
249                     "LetsCrawl.com"
250                     "Lincoln State Web Browser"
251                     "Links4US-Crawler"
252                     "LOOQ"
253                     "Lsearch/sondeur"
254                     "MapoftheInternet.com"
255                     "NationalDirectory"
256                     "NetCarta_WebMapper"
257                     "NewsGator"
258                     "NextGenSearchBot"
259                     "ng"
260                     "nicebot"
261                     "NP"
262                     "NPBot"
263                     "Nudelsalat"
264                     "Nutch"
265                     "OmniExplorer_Bot"
266                     "OpenIntelligenceData"
267                     "Oracle Enterprise Search"
268                     "Pajaczek"
269                     "panscient.com"
270                     "PeerFactor 404 crawler"
271                     "PeerFactor Crawler"
272                     "PlantyNet"
273                     "PlantyNet_WebRobot"
274                     "plinki"
275                     "PMAFind"
276                     "Pogodak!"
277                     "QuickFinder Crawler"
278                     "Radiation Retriever"
279                     "Reaper"
280                     "RedCarpet"
281                     "ScorpionBot"
282                     "Scrubby"
283                     "Scumbot"
284                     "searchbot"
285                     "Seeker.lookseek.com"
286                     "SeznamBot"
287                     "ShowXML"
288                     "snap.com"
289                     "snap.com beta crawler"
290                     "Snapbot"
291                     "SnapPreviewBot"
292                     "sohu"
293                     "SpankBot"
294                     "Speedy Spider"
295                     "Speedy_Spider"
296                     "SpeedySpider"
297                     "spider"
298                     "SquigglebotBot"
299                     "SurveyBot"
300                     "SynapticSearch"
301                     "T-H-U-N-D-E-R-S-T-O-N-E"
302                     "Talkro Web-Shot"
303                     "Tarantula"
304                     "TerrawizBot"
305                     "TheInformant"
306                     "TMCrawler"
307                     "TridentSpider"
308                     "Tutorial Crawler"
309                     "Twiceler"
310                     "unwrapbot"
311                     "URI::Fetch"
312                     "VengaBot"
313                     "Vonna.com b o t"
314                     "Vortex"
315                     "Votay bot"
316                     "WebAlta Crawler"
317                     "Webbot"
318                     "Webclipping.com"
319                     "WebCorp"
320                     "Webinator"
321                     "WIRE"
322                     "WISEbot"
323                     "Xerka WebBot"
324                     "XSpider"
325                     "YodaoBot"
326                     "Yoono"
327                     "yoono"
328                 }
329             }
330             { allows V{ } }
331             { disallows V{ URL" /" } }
332             { unknowns H{ } }
333         }
334     }
! Quotation under test: reads vocab:robots/robots.txt with the utf8 encoding
! and passes the resulting contents to parse-robots.txt.
335 ] [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test