1 ! Copyright (C) 2016 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: arrays assocs combinators combinators.short-circuit
4 continuations fry io.encodings.utf8 io.files kernel locals make
5 math math.order modern.paths modern.slices sequences
6 sequences.extras sets splitting strings unicode vocabs.loader ;
9 ERROR: string-expected-got-eof n string ;
10 ERROR: long-opening-mismatch tag open n string ch ;
13 MACRO:: read-double-matched ( open-ch -- quot: ( n string tag ch -- n' string seq ) )
14 open-ch dup matching-delimiter {
15 [ drop 2 swap <string> ]
17 [ nip 2 swap <string> ]
18 } 2cleave :> ( openstr2 openstr1 closestr2 )
22 tag 1 cut-slice* drop tag! ! tag of (=( is ( here, fix it
23 n string openstr1 slice-til-separator-inclusive [ -1 modify-from ] dip :> ( n' string' opening ch )
24 ch open-ch = [ tag openstr2 n string ch long-opening-mismatch ] unless
25 opening matching-delimiter-string :> needle
27 n' string' needle slice-til-string :> ( n'' string'' payload closing )
29 tag opening payload closing 4array
32 tag 1 cut-slice* swap tag! 1 modify-to :> opening
33 n 1 + string closestr2 slice-til-string :> ( n' string' payload closing )
35 tag opening payload closing 4array
37 [ [ tag openstr2 n string ] dip long-opening-mismatch ]
41 : read-double-matched-paren ( n string tag ch -- n' string seq ) CHAR: ( read-double-matched ;
42 : read-double-matched-bracket ( n string tag ch -- n' string seq ) CHAR: [ read-double-matched ;
43 : read-double-matched-brace ( n string tag ch -- n' string seq ) CHAR: { read-double-matched ;
47 ERROR: lex-expected-but-got-eof n string expected ;
48 ! For implementing [ { (
49 : lex-until ( n string tag-sequence -- n' string payload )
52 lex-factor-top dup f like [ , ] when* [
54 ! } gets a chance, but then also full seq { } after recursion...
55 [ _ ] dip '[ _ sequence= ] any? not
60 _ _ _ lex-expected-but-got-eof
67 DEFER: lex-factor-nested
68 : lex-colon-until ( n string tag-sequence -- n' string payload )
71 lex-factor-nested dup f like [ , ] when* [
73 ! This is for ending COLON: forms like ``A: PRIVATE>``
77 ! } gets a chance, but then also full seq { } after recursion...
78 [ _ ] dip '[ _ sequence= ] any? not
89 : split-double-dash ( seq -- seqs )
90 dup [ { [ "--" sequence= ] } 1&& ] split-when
91 dup length 1 > [ nip ] [ drop ] if ;
93 MACRO:: read-matched ( ch -- quot: ( n string tag -- n' string slice' ) )
94 ch dup matching-delimiter {
95 [ drop "=" swap prefix ]
97 } 2cleave :> ( openstreq closestr1 ) ! [= ]
100 2over nth-check-eof {
101 { [ dup openstreq member? ] [ ch read-double-matched ] } ! (=( or ((
103 drop dup '[ _ matching-delimiter-string closestr1 2array members lex-until ] dip
104 swap unclip-last 3array ] } ! ( foo )
105 [ drop [ slice-til-whitespace drop ] dip span-slices ] ! (foo)
109 : read-bracket ( n string slice -- n' string slice' ) CHAR: [ read-matched ;
110 : read-brace ( n string slice -- n' string slice' ) CHAR: { read-matched ;
111 : read-paren ( n string slice -- n' string slice' ) CHAR: ( read-matched ;
112 : read-string-payload ( n string -- n' string )
114 { CHAR: \\ CHAR: \" } slice-til-separator-inclusive {
116 { CHAR: \" [ drop ] }
117 { CHAR: \\ [ drop next-char-from drop read-string-payload ] }
120 string-expected-got-eof
123 :: read-string ( n string tag -- n' string seq )
124 n string read-string-payload drop :> n'
126 n' [ n string string-expected-got-eof ] unless
127 n n' 1 - string <slice>
128 n' 1 - n' string <slice>
131 : take-comment ( n string slice -- n' string comment )
132 2over ?nth CHAR: [ = [
133 [ 1 + ] 2dip 2over ?nth read-double-matched-bracket
135 [ slice-til-eol drop ] dip swap 2array
138 : terminator? ( slice -- ? )
146 ERROR: expected-length-tokens n string length seq ;
147 : ensure-no-false ( n string seq -- n string seq )
148 dup [ length 0 > ] all? [ [ length ] keep expected-length-tokens ] unless ;
150 ERROR: token-expected n string obj ;
151 ERROR: unexpected-terminator n string slice ;
152 : read-lowercase-colon ( n string slice -- n' string lowercase-colon )
153 dup [ CHAR: : = ] count-tail
155 _ [ lex-factor ] replicate ensure-no-false dup [ token-expected ] unless
156 dup terminator? [ unexpected-terminator ] when
159 : (strict-upper?) ( string -- ? )
164 { [ CHAR: A CHAR: Z between? ] [ "':-\\#" member? ] } 1||
167 ! At least one char must...
168 [ [ { [ CHAR: A CHAR: Z between? ] [ CHAR: ' = ] } 1|| ] any? ]
171 : strict-upper? ( string -- ? )
172 { [ ":" sequence= ] [ (strict-upper?) ] } 1|| ;
175 : section-open? ( string -- ? )
179 [ rest strict-upper? ]
183 : html-self-close? ( string -- ? )
187 [ rest strict-upper? not ]
188 [ [ blank? ] any? not ]
192 : html-full-open? ( string -- ? )
196 [ second CHAR: / = not ]
197 [ rest strict-upper? not ]
198 [ [ blank? ] any? not ]
202 : html-half-open? ( string -- ? )
206 [ second CHAR: / = not ]
207 [ rest strict-upper? not ]
208 [ [ blank? ] any? not ]
212 : html-close? ( string -- ? )
216 [ rest strict-upper? not ]
217 [ [ blank? ] any? not ]
221 : special-acute? ( string -- ? )
230 : upper-colon? ( string -- ? )
231 dup { [ length 0 > ] [ [ CHAR: : = ] all? ] } 1&& [
236 [ "\\" head? not ] ! XXX: good?
238 [ dup [ CHAR: : = ] find drop head strict-upper? ]
242 : section-close? ( string -- ? )
245 [ "\\" head? not ] ! XXX: good?
249 [ but-last strict-upper? ]
250 [ { [ ";" head? ] [ rest but-last strict-upper? ] } 1&& ]
255 : read-til-semicolon ( n string slice -- n' string semi )
256 dup '[ but-last ";" append ";" 2array { "--" ")" } append lex-colon-until ] dip
258 ! What ended the FOO: .. ; form?
259 ! Remove the ; from the payload if present
260 ! XXX: probably can remove this, T: is dumb
261 ! Also in stack effects ( T: int -- ) can be ended by -- and )
263 { [ dup ";" sequence= ] [ drop unclip-last 3array ] }
264 { [ dup ";" tail? ] [ drop unclip-last 3array ] }
265 { [ dup "--" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
266 { [ dup "]" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
267 { [ dup "}" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
268 { [ dup ")" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } ! (n*quot) breaks
269 { [ dup section-close? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
270 { [ dup upper-colon? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
274 ERROR: colon-word-must-be-all-uppercase-or-lowercase n string word ;
275 : read-colon ( n string slice -- n' string colon )
277 { [ dup strict-upper? ] [ read-til-semicolon ] }
278 { [ dup ":" tail? ] [ dup ":" head? [ read-lowercase-colon ] unless ] } ! :foo: vs foo:
282 : read-acute-html ( n string slice -- n' string acute )
285 { [ dup section-open? ] [
287 matching-section-delimiter 1array lex-until
288 ] keep swap unclip-last 3array
291 { [ dup html-self-close? ] [
295 { [ dup html-full-open? ] [
298 dup ">" tail? [ but-last-slice ] when
299 "</" ">" surround 1array lex-until unclip-last
303 { [ dup html-half-open? ] [
305 [ { ">" "/>" } lex-until ] dip
308 "</" ">" surround array '[ _ lex-until ] dip unclip-last
309 -rot roll unclip-last [ 3array ] 2dip 3array
312 swap unclip-last 3array
316 { [ dup html-close? ] [
319 [ [ slice-til-whitespace drop ] dip span-slices ]
322 : read-acute ( n string slice -- n' string acute )
323 [ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array ;
325 ! Words like append! and suffix! are allowed for now.
326 : read-exclamation ( n string slice -- n' string obj )
327 dup { [ "!" sequence= ] [ "#!" sequence= ] } 1||
328 [ take-comment ] [ merge-slice-til-whitespace ] if ;
330 ERROR: no-backslash-payload n string slice ;
331 : (read-backslash) ( n string slice -- n' string obj )
332 merge-slice-til-whitespace dup "\\" tail? [
334 dup [ CHAR: \\ = ] count-tail
336 _ [ skip-blank-from slice-til-whitespace drop ] replicate
338 dup [ no-backslash-payload ] unless
342 DEFER: lex-factor-top*
343 : read-backslash ( n string slice -- n' string obj )
344 ! foo\ so far, could be foo\bar{
345 ! remove the \ and continue til delimiter/eof
346 [ "\"!:[{(<>\s\r\n" slice-til-either ] dip swap [ span-slices ] dip
350 dup [ CHAR: \\ = ] all? [ (read-backslash) ] [ merge-slice-til-whitespace ] if
352 ! foo\ or foo\bar (?)
353 over "\\" tail? [ drop (read-backslash) ] [ lex-factor-top* ] if
356 ! If the slice is 0 width, we stopped on whitespace.
357 ! Advance the index and read again!
359 : read-token-or-whitespace-top ( n string slice -- n' string slice/f )
360 dup length 0 = [ [ 1 + ] 2dip drop lex-factor-top ] when ;
362 : read-token-or-whitespace-nested ( n string slice -- n' string slice/f )
363 dup length 0 = [ [ 1 + ] 2dip drop lex-factor-nested ] when ;
365 : lex-factor-fallthrough ( n/f string slice/f ch/f -- n'/f string literal )
367 { CHAR: \ [ read-backslash ] }
368 { CHAR: [ [ read-bracket ] }
369 { CHAR: { [ read-brace ] }
370 { CHAR: ( [ read-paren ] }
374 { CHAR: " [ read-string ] }
375 { CHAR: ! [ read-exclamation ] }
377 [ [ CHAR: > = not ] slice-until ] dip merge-slices
379 [ slice-til-whitespace drop ] dip ?span-slices
385 ! Inside a FOO: or a <FOO FOO>
386 : lex-factor-nested* ( n/f string slice/f ch/f -- n'/f string literal )
388 ! Nested ``A: a B: b`` so rewind and let the parser get it top-level
390 ! A: B: then interrupt the current parser
391 ! A: b: then keep going
392 merge-slice-til-whitespace
393 dup { [ upper-colon? ] [ ":" = ] } 1||
399 ! FOO: a b <BAR: ;BAR>
402 ! FOO: a b <asdf asdf>
404 ! if we are in a FOO: and we hit a <BAR or <BAR:
406 ! Don't rewind for a <foo/> or <foo></foo>
407 [ slice-til-whitespace drop ] dip span-slices
408 dup section-open? [ rewind-slice f ] when
410 { CHAR: \s [ read-token-or-whitespace-nested ] }
411 { CHAR: \r [ read-token-or-whitespace-nested ] }
412 { CHAR: \n [ read-token-or-whitespace-nested ] }
413 [ lex-factor-fallthrough ]
416 : lex-factor-nested ( n/f string -- n'/f string literal )
418 "\"\\!:[{(]})<>\s\r\n" slice-til-either
419 lex-factor-nested* ; inline
421 : lex-factor-top* ( n/f string slice/f ch/f -- n'/f string literal )
423 { CHAR: : [ merge-slice-til-whitespace read-colon ] }
425 ! FOO: a b <BAR: ;BAR>
428 ! FOO: a b <asdf asdf>
430 ! if we are in a FOO: and we hit a <BAR or <BAR:
432 [ slice-til-whitespace drop ] dip span-slices
434 dup section-open? [ read-acute ] when
437 { CHAR: \s [ read-token-or-whitespace-top ] }
438 { CHAR: \r [ read-token-or-whitespace-top ] }
439 { CHAR: \n [ read-token-or-whitespace-top ] }
440 [ lex-factor-fallthrough ]
443 : lex-factor-top ( n/f string -- n'/f string literal )
445 "\"\\!:[{(]})<>\s\r\n" slice-til-either
446 lex-factor-top* ; inline
448 ERROR: compound-syntax-disallowed n seq obj ;
449 : check-for-compound-syntax ( n/f seq obj -- n/f seq obj )
450 dup length 1 > [ compound-syntax-disallowed ] when ;
452 : check-compound-loop ( n/f string -- n/f string ? )
453 [ ] [ peek-from ] [ previous-from ] 2tri
454 [ blank? ] bi@ or not ! no blanks between tokens
455 pick and ; ! and a valid index
457 : lex-factor ( n/f string/f -- n'/f string literal/f )
459 ! Compound syntax loop
461 lex-factor-top f like [ , ] when*
462 ! concatenated syntax ( a )[ a 1 + ]( b )
466 check-for-compound-syntax
467 ! concat ! "ALIAS: n*quot (n*quot)" string>literals ... breaks here
470 : string>literals ( string -- sequence )
472 [ lex-factor [ , ] when* over ] loop
475 : vocab>literals ( vocab -- sequence )
476 ".private" ?tail drop
477 vocab-source-path utf8 file-contents string>literals ;
479 : path>literals ( path -- sequence )
480 utf8 file-contents string>literals ;
482 : lex-paths ( vocabs -- assoc )
483 [ [ path>literals ] [ nip ] recover ] map-zip ;
485 : lex-vocabs ( vocabs -- assoc )
486 [ [ vocab>literals ] [ nip ] recover ] map-zip ;
488 : failed-lexing ( assoc -- assoc' ) [ nip array? ] assoc-reject ;
490 : lex-core ( -- assoc ) core-vocabs lex-vocabs ;
491 : lex-basis ( -- assoc ) basis-vocabs lex-vocabs ;
492 : lex-extra ( -- assoc ) extra-vocabs lex-vocabs ;
493 : lex-roots ( -- assoc ) lex-core lex-basis lex-extra 3append ;
495 : lex-docs ( -- assoc ) all-docs-paths lex-paths ;
496 : lex-tests ( -- assoc ) all-tests-paths lex-paths ;
498 : lex-all ( -- assoc )
499 lex-roots lex-docs lex-tests 3append ;