1 ! Copyright (C) 2014 John Benediktsson
2 ! See http://factorcode.org/license.txt for BSD license
4 USING: accessors combinators formatting io.encodings.ascii
5 io.files kernel literals math math.functions math.order
6 multiline regexp sequences sequences.extras sets splitting
13 : trimmed ( seq -- seq )
14 [ [ unicode:blank? ] trim ] map harvest ;
16 : split-paragraphs ( str -- seq )
17 R/ \r?\n\r?\n/ re-split trimmed ;
20 CONSTANT: ABBREVIATIONS {
21 "jr" "mr" "mrs" "ms" "dr" "prof" "sr" "sen" "rep" "rev"
22 "gov" "atty" "supt" "det" "rev" "col','gen" "lt" "cmdr"
23 "adm" "capt" "sgt" "cpl" "maj" ! titles
25 "dept" "univ" "uni" "assn" "bros" "inc" "ltd" "co" "corp"
28 "jan" "feb" "mar" "apr" "may" "jun" "jul" "aug" "sep" "oct"
29 "nov" "dec" "sept" ! months
31 "mon" "tue" "wed" "thu" "fri" "sat" "sun" ! days
33 "vs" "etc" "no" "esp" "cf" ! misc
35 "ave" "bld" "blvd" "cl" "ct" "cres" "dr" "rd" "st" ! streets
39 : split-sentences ( str -- seq )
41 ! Mark end of sentences with EOS marker
42 R/ ((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/
43 [ [ ".?!\r\n\"')]}" member? not ] cut-when "\x01" glue ]
47 $[ "(\\.\\.\\.*)\x01" <regexp> ] [ but-last-slice ]
52 "(?:\\s(?:(?:(?:\\w\\.){2,}\\w?)|(?:\\w\\.\\w)))\x01(\\s+[a-z0-9])"
54 ] [ [ 1 = ] cut-when append ] re-replace-with
58 ABBREVIATIONS "|" join "(" ")\\.\x01" surround
60 ] [ CHAR: . over index head ] re-replace-with
63 "\x01" split trimmed ;
65 CONSTANT: sub-syllable {
66 R/ [^aeiou]e$/ ! give, love, bone, done, ride ...
67 R/ [aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/
68 ! (passive) past participles and 3rd person sing present verbs:
69 ! bared, liked, called, tricked, bashed, matched
71 R/ .e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/
72 ! nominal, adjectival and adverbial derivatives from -e$ roots:
73 ! absolutely, nicely, likeness, basement, hopeless
74 ! hopeful, tastefully, wasteful
76 R/ ion/ ! action, diction, fiction
77 R/ [ct]ia[nl]/ ! special(ly), initial, physician, christian
78 R/ [^cx]iou/ ! illustrious, NOT spacious, gracious, anxious, noxious
79 R/ sia$/ ! amnesia, polynesia
80 R/ .gue$/ ! dialogue, intrigue, colleague
83 CONSTANT: add-syllable {
84 R/ i[aiou]/ ! alias, science, phobia
85 R/ [dls]ien/ ! salient, gradient, transient
86 R/ [aeiouym]ble$/ ! -Vble, plus -mble
87 R/ [aeiou]{3}/ ! agreeable
89 R/ ism$/ ! sexism, racism
90 R/ (?:([^aeiouy])\1|ck|mp|ng)le$/ ! bubble, cattle, cackle, sample, angle
92 R/ [aeiou]y[aeiou]/ ! annoying, layer
95 : syllables ( str -- n )
96 dup length 1 = [ drop 1 ] [
97 >lower CHAR: . swap remove
98 [ R/ [aeiouy]+/ count-matches ]
99 [ sub-syllable [ matches? ] with count - ]
100 [ add-syllable [ matches? ] with count + ] tri
104 : split-words ( str -- words )
105 R/ \b([a-z][a-z\-']*)\b/i all-matching-subseqs ;
107 TUPLE: text-analysis #paragraphs #sentences #chars #words
108 #syllables #complex-words #unique-words #difficult-words ;
110 : <text-analysis> ( str -- text-analysis )
112 [ split-paragraphs length ]
113 [ split-sentences length ]
114 [ [ unicode:blank? not ] count ]
120 [ syllables ] [ "-" member? not ] bi
121 over 2 > and 1 0 ? [ + ] bi-curry@ bi*
126 "vocab:text-analysis/dale-chall.txt" ascii
127 file-lines fast-set '[ >lower _ in? not ] count
129 } cleave text-analysis boa ;
131 : syllables-per-word ( text-analysis -- n )
132 [ #syllables>> ] [ #words>> ] bi / ;
134 : words-per-sentence ( text-analysis -- n )
135 [ #words>> ] [ #sentences>> ] bi / ;
137 : chars-per-word ( text-analysis -- n )
138 [ #chars>> ] [ #words>> ] bi / ;
140 : sentences-per-word ( text-analysis -- n )
141 [ #sentences>> ] [ #words>> ] bi / ;
143 : percent-complex-words ( text-analysis -- n )
144 [ #complex-words>> ] [ #words>> ] bi / 100 * ;
146 : percent-difficult-words ( text-analysis -- n )
147 [ #difficult-words>> ] [ #words>> ] bi / 100 * ;
149 : flesch-kincaid ( text-analysis -- n )
150 [ words-per-sentence 0.39 * ]
151 [ syllables-per-word 11.8 * ] bi + 15.59 - ;
153 : flesch ( text-analysis -- n )
155 [ words-per-sentence 1.015 * - ]
156 [ syllables-per-word 84.6 * - ] bi ;
158 : gunning-fog ( text-analysis -- n )
159 [ words-per-sentence ] [ percent-complex-words ] bi + 0.4 * ;
161 : coleman-liau ( text-analysis -- n )
162 [ chars-per-word 5.88 * ]
163 [ sentences-per-word 29.6 * ] bi - 15.8 - ;
165 : smog ( text-analysis -- n )
166 [ #complex-words>> ] [ #sentences>> 30 swap / ] bi *
167 sqrt 1.0430 * 3.1291 + ;
169 : automated-readability ( text-analysis -- n )
170 [ chars-per-word 4.71 * ]
171 [ words-per-sentence 0.5 * ] bi + 21.43 - ;
173 : dale-chall ( text-analysis -- n )
175 percent-difficult-words
176 [ 0.1579 * ] [ 0.05 > [ 3.6365 + ] when ] bi
178 [ words-per-sentence 0.0496 * ] bi + ;
180 STRING: text-report-format
181 Number of paragraphs %d
182 Number of sentences %d
184 Number of characters %d
186 Average words per sentence %.2f
187 Average syllables per word %.2f
189 Flesch Reading Ease %2.2f
190 Flesh-Kincaid Grade Level %2.2f
191 Gunning fog index %2.2f
192 Coleman–Liau index %2.2f
194 Automated Readability index %2.2f
195 Dale-Chall readability %2.2f
201 : analyze-text. ( str -- )
207 [ words-per-sentence ]
208 [ syllables-per-word ]
214 [ automated-readability ]
216 } cleave text-report-format printf ;