gitweb.factorcode.org Git - factor.git/blob - extra/robots/robots-tests.factor
Fixes #2966
[factor.git] / extra / robots / robots-tests.factor
1 ! Copyright (C) 2009 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: calendar io.encodings.utf8 io.files robots tools.test
4 urls ;
5
! Expected stack output of the quotation at the bottom of this form, compared by unit-test.
! Two values are expected: a one-element sequence holding a URL string
! (presumably the sitemap list -- confirm against the robots vocabulary),
! followed by a sequence of rules tuples.
6 {
7     { "http://www.chiplist.com/sitemap.txt" }
8     {
        ! First rules tuple: the "*" user agent. No allows, ten disallowed paths,
        ! and the only entry here that sets visit-time, request-rate and crawl-delay.
9         T{ rules
10             { user-agents V{ "*" } }
11             { allows V{ } }
12             { disallows
13                 V{
14                     URL" /cgi-bin/"
15                     URL" /scripts/"
16                     URL" /ChipList2/scripts/"
17                     URL" /ChipList2/styles/"
18                     URL" /ads/"
19                     URL" /ChipList2/ads/"
20                     URL" /advertisements/"
21                     URL" /ChipList2/advertisements/"
22                     URL" /graphics/"
23                     URL" /ChipList2/graphics/"
24                 }
25             }
26             { visit-time
27                 {
28                     T{ duration { hour 2 } }
29                     T{ duration { hour 5 } }
30                 }
31             }
32             { request-rate 1 }
33             { crawl-delay 1 }
34             { unknowns H{ } }
35         }
        ! From here through "WebReaper": one rules tuple per named user agent,
        ! each with empty allows, a single disallow of / and empty unknowns.
36         T{ rules
37             { user-agents V{ "UbiCrawler" } }
38             { allows V{ } }
39             { disallows V{ URL" /" } }
40             { unknowns H{ } }
41         }
42         T{ rules
43             { user-agents V{ "DOC" } }
44             { allows V{ } }
45             { disallows V{ URL" /" } }
46             { unknowns H{ } }
47         }
48         T{ rules
49             { user-agents V{ "Zao" } }
50             { allows V{ } }
51             { disallows V{ URL" /" } }
52             { unknowns H{ } }
53         }
54         T{ rules
55             { user-agents V{ "sitecheck.internetseer.com" } }
56             { allows V{ } }
57             { disallows V{ URL" /" } }
58             { unknowns H{ } }
59         }
60         T{ rules
61             { user-agents V{ "Zealbot" } }
62             { allows V{ } }
63             { disallows V{ URL" /" } }
64             { unknowns H{ } }
65         }
66         T{ rules
67             { user-agents V{ "MSIECrawler" } }
68             { allows V{ } }
69             { disallows V{ URL" /" } }
70             { unknowns H{ } }
71         }
72         T{ rules
73             { user-agents V{ "SiteSnagger" } }
74             { allows V{ } }
75             { disallows V{ URL" /" } }
76             { unknowns H{ } }
77         }
78         T{ rules
79             { user-agents V{ "WebStripper" } }
80             { allows V{ } }
81             { disallows V{ URL" /" } }
82             { unknowns H{ } }
83         }
84         T{ rules
85             { user-agents V{ "WebCopier" } }
86             { allows V{ } }
87             { disallows V{ URL" /" } }
88             { unknowns H{ } }
89         }
90         T{ rules
91             { user-agents V{ "Fetch" } }
92             { allows V{ } }
93             { disallows V{ URL" /" } }
94             { unknowns H{ } }
95         }
96         T{ rules
97             { user-agents V{ "Offline Explorer" } }
98             { allows V{ } }
99             { disallows V{ URL" /" } }
100             { unknowns H{ } }
101         }
102         T{ rules
103             { user-agents V{ "Teleport" } }
104             { allows V{ } }
105             { disallows V{ URL" /" } }
106             { unknowns H{ } }
107         }
108         T{ rules
109             { user-agents V{ "TeleportPro" } }
110             { allows V{ } }
111             { disallows V{ URL" /" } }
112             { unknowns H{ } }
113         }
114         T{ rules
115             { user-agents V{ "WebZIP" } }
116             { allows V{ } }
117             { disallows V{ URL" /" } }
118             { unknowns H{ } }
119         }
120         T{ rules
121             { user-agents V{ "linko" } }
122             { allows V{ } }
123             { disallows V{ URL" /" } }
124             { unknowns H{ } }
125         }
126         T{ rules
127             { user-agents V{ "HTTrack" } }
128             { allows V{ } }
129             { disallows V{ URL" /" } }
130             { unknowns H{ } }
131         }
132         T{ rules
133             { user-agents V{ "Microsoft.URL.Control" } }
134             { allows V{ } }
135             { disallows V{ URL" /" } }
136             { unknowns H{ } }
137         }
138         T{ rules
139             { user-agents V{ "Xenu" } }
140             { allows V{ } }
141             { disallows V{ URL" /" } }
142             { unknowns H{ } }
143         }
144         T{ rules
145             { user-agents V{ "larbin" } }
146             { allows V{ } }
147             { disallows V{ URL" /" } }
148             { unknowns H{ } }
149         }
150         T{ rules
151             { user-agents V{ "libwww" } }
152             { allows V{ } }
153             { disallows V{ URL" /" } }
154             { unknowns H{ } }
155         }
156         T{ rules
157             { user-agents V{ "ZyBORG" } }
158             { allows V{ } }
159             { disallows V{ URL" /" } }
160             { unknowns H{ } }
161         }
162         T{ rules
163             { user-agents V{ "Download Ninja" } }
164             { allows V{ } }
165             { disallows V{ URL" /" } }
166             { unknowns H{ } }
167         }
168         T{ rules
169             { user-agents V{ "wget" } }
170             { allows V{ } }
171             { disallows V{ URL" /" } }
172             { unknowns H{ } }
173         }
174         T{ rules
175             { user-agents V{ "grub-client" } }
176             { allows V{ } }
177             { disallows V{ URL" /" } }
178             { unknowns H{ } }
179         }
180         T{ rules
181             { user-agents V{ "k2spider" } }
182             { allows V{ } }
183             { disallows V{ URL" /" } }
184             { unknowns H{ } }
185         }
186         T{ rules
187             { user-agents V{ "NPBot" } }
188             { allows V{ } }
189             { disallows V{ URL" /" } }
190             { unknowns H{ } }
191         }
192         T{ rules
193             { user-agents V{ "WebReaper" } }
194             { allows V{ } }
195             { disallows V{ URL" /" } }
196             { unknowns H{ } }
197         }
        ! Last rules tuple: a long list of user agents collected into one entry that
        ! carries the same empty allows and single disallow of / as the entries above.
        ! Some names differ only by case (e.g. "bumblebee"/"Bumblebee", "Yoono"/"yoono")
        ! and are expected as distinct strings.
198         T{ rules
199             { user-agents
200                 V{
201                     "abot"
202                     "ALeadSoftbot"
203                     "BeijingCrawler"
204                     "BilgiBot"
205                     "bot"
206                     "botlist"
207                     "BOTW Spider"
208                     "bumblebee"
209                     "Bumblebee"
210                     "BuzzRankingBot"
211                     "Charlotte"
212                     "Clushbot"
213                     "Crawler"
214                     "CydralSpider"
215                     "DataFountains"
216                     "DiamondBot"
217                     "Dulance bot"
218                     "DYNAMIC"
219                     "EARTHCOM.info"
220                     "EDI"
221                     "envolk"
222                     "Exabot"
223                     "Exabot-Images"
224                     "Exabot-Test"
225                     "exactseek-pagereaper"
226                     "Exalead NG"
227                     "FANGCrawl"
228                     "Feed::Find"
229                     "flatlandbot"
230                     "Gigabot"
231                     "GigabotSiteSearch"
232                     "GurujiBot"
233                     "Hatena Antenna"
234                     "Hatena Bookmark"
235                     "Hatena RSS"
236                     "HatenaScreenshot"
237                     "Helix"
238                     "HiddenMarket"
239                     "HyperEstraier"
240                     "iaskspider"
241                     "IIITBOT"
242                     "InfociousBot"
243                     "iVia"
244                     "iVia Page Fetcher"
245                     "Jetbot"
246                     "Kolinka Forum Search"
247                     "KRetrieve"
248                     "LetsCrawl.com"
249                     "Lincoln State Web Browser"
250                     "Links4US-Crawler"
251                     "LOOQ"
252                     "Lsearch/sondeur"
253                     "MapoftheInternet.com"
254                     "NationalDirectory"
255                     "NetCarta_WebMapper"
256                     "NewsGator"
257                     "NextGenSearchBot"
258                     "ng"
259                     "nicebot"
260                     "NP"
261                     "NPBot"
262                     "Nudelsalat"
263                     "Nutch"
264                     "OmniExplorer_Bot"
265                     "OpenIntelligenceData"
266                     "Oracle Enterprise Search"
267                     "Pajaczek"
268                     "panscient.com"
269                     "PeerFactor 404 crawler"
270                     "PeerFactor Crawler"
271                     "PlantyNet"
272                     "PlantyNet_WebRobot"
273                     "plinki"
274                     "PMAFind"
275                     "Pogodak!"
276                     "QuickFinder Crawler"
277                     "Radiation Retriever"
278                     "Reaper"
279                     "RedCarpet"
280                     "ScorpionBot"
281                     "Scrubby"
282                     "Scumbot"
283                     "searchbot"
284                     "Seeker.lookseek.com"
285                     "SeznamBot"
286                     "ShowXML"
287                     "snap.com"
288                     "snap.com beta crawler"
289                     "Snapbot"
290                     "SnapPreviewBot"
291                     "sohu"
292                     "SpankBot"
293                     "Speedy Spider"
294                     "Speedy_Spider"
295                     "SpeedySpider"
296                     "spider"
297                     "SquigglebotBot"
298                     "SurveyBot"
299                     "SynapticSearch"
300                     "T-H-U-N-D-E-R-S-T-O-N-E"
301                     "Talkro Web-Shot"
302                     "Tarantula"
303                     "TerrawizBot"
304                     "TheInformant"
305                     "TMCrawler"
306                     "TridentSpider"
307                     "Tutorial Crawler"
308                     "Twiceler"
309                     "unwrapbot"
310                     "URI::Fetch"
311                     "VengaBot"
312                     "Vonna.com b o t"
313                     "Vortex"
314                     "Votay bot"
315                     "WebAlta Crawler"
316                     "Webbot"
317                     "Webclipping.com"
318                     "WebCorp"
319                     "Webinator"
320                     "WIRE"
321                     "WISEbot"
322                     "Xerka WebBot"
323                     "XSpider"
324                     "YodaoBot"
325                     "Yoono"
326                     "yoono"
327                 }
328             }
329             { allows V{ } }
330             { disallows V{ URL" /" } }
331             { unknowns H{ } }
332         }
333     }
! Code under test: read vocab:robots/robots.txt as UTF-8 text and hand the
! contents to parse-robots.txt; unit-test compares its outputs with the array above.
334 } [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test