; ____ _ ____ _ _ _ _ ; / ___| ___ _ __ | |_ ___ _ __ ___ ___ / ___| _ __ | (_) |_| |_ ___ _ __ ; \___ \ / _ \ '_ \| __/ _ \ '_ \ / __/ _ \ \___ \| '_ \| | | __| __/ _ \ '__| ; ___) | __/ | | | || __/ | | | (_| __/ ___) | |_) | | | |_| || __/ | ; |____/ \___|_| |_|\__\___|_| |_|\___\___| |____/| .__/|_|_|\__|\__\___|_| ; |_| ; Ported from the Lingua::EN:Sentence Perl module ; (https://github.com/kimryan/Lingua-EN-Sentence) by David Ireland ; djireland79 at gmail dot com ; This module contains the procedure get-sentences, which splits text into its ; constituent sentences, based on a regular expressions and a list of ; abbreviations (built in and given). ; Certain well know exceptions, such as abbreviations, may cause incorrect ; segmentations. But some of them are already integrated into this code and are ; being taken care of. Still, if you see that there are words causing the ; get-sentences procedure to fail, you can add those to the module, so it notices them ; _ _ _ _ _ ; / \ | | __ _ ___ _ __(_) |_| |__ _ __ ___ ; / _ \ | |/ _` |/ _ \| '__| | __| '_ \| '_ ` _ \ ; / ___ \| | (_| | (_) | | | | |_| | | | | | | | | ; /_/ \_\_|\__, |\___/|_| |_|\__|_| |_|_| |_| |_| ; |___/ ; The algorithm uses a series of regular expressions to split the text into ; sentences. Possible locations for end-of-sentence are tagged. A set of rules ; decide when an end-of-sentence is justified and when it is a mistake. ; In case of a mistake, the end-of-sentence mark is removed. ; ; What are such mistakes? Cases of abbreviations, for example. I have a list of ; such abbreviations (Please see `Acronym/Abbreviations list' section), and more ; general rules (for example, the abbreviations 'i.e.' and '.e.g.' need not to be ; in the list as a special rule takes care of all single letter abbreviations). (module sentence-split (export get-sentences set-end-of-sentence-marker! end-of-sentence-marker add-acronyms! acronyms) (import data-structures chicken scheme) (use s regex) (define *EOS* "\001") ; End of sentence marker (define *P* "[\\.!\\?]") ; Punctuation (define *AP* "(?:'|\"|\\?|\\)|\\]|\\})?") ; After punctuation (define *PAP* (string-append *P* *AP*)) (define *PEOPLE* '("mr" "mrs" "ms" "dr" "prof" "mme" "ms?gr" "sens?" "reps?" "gov" "attys?" "supt" "insp" "const" "det" "pr" "revd?" "ald" "rt" "hon")) (define *TITLE-SUFFIXES* '("PhD" "jn?r" "sn?r" "esq" "md" "llb")) (define *MILITARY* '("col" "gen" "lt" "cdr" "cmdr" "adm" "capt" "sgt" "cpl" "maj" "pte")) (define *INSTITUTES* '("dept" "univ" "assn" "bros")) (define *COMPANIES* '("inc" "ltd" "co" "corp")) (define *PLACES* '("arc" "al" "ave" "blv?d" "cl" "ct" "cres" "dr" "expy?" "fw?y" "hwa?y" "la" "pde?" "pl" "plz" "rd" "st" "tce" "dist" "mt" "km" "in" "ft" "Ala" "Ariz" "Ark" "Cal" "Calif" "Col" "Colo" "Conn" "Del" "Fed" "Fla" "Ga" "Ida" "Id" "Ill" "Ind" "Ia" "Kan" "Kans" "Ken" "Ky" "La" "Me" "Md" "Is" "Mass" "Mich" "Minn" "Miss" "Mo" "Mont" "Neb" "Nebr" "Nev" "Mex" "Okla" "Ok" "Ore" "Penna" "Penn" "Pa" "Dak" "Tenn" "Tex" "Ut" "Vt" "Va" "Wash" "Wis" "Wisc" "Wy" "Wyo" "USAFA" "Alta" "Man" "Ont" "Qu?" "Sask" "Yuk" "Aust" "Vic" "Qld" "Tas")) (define *MONTHS* '("jan" "feb" "mar" "apr" "may" "jun" "jul" "aug" "sep" "sept" "oct" "nov" "dec")) (define *MISC* '("no" "esp" "est")) (define *LATIN* '("vs" "etc" "al" "ibid" "sic")) (define *ABBREVIATION-LIST* (append *PEOPLE* *TITLE-SUFFIXES* *MILITARY* *INSTITUTES* *COMPANIES* *PLACES* *MONTHS* *MISC* *LATIN*)) (define *ABBREVIATIONS* #f) (define (build-abbreviations!) (set! *ABBREVIATIONS* (regexp (string-append "(\\b(\\?:" (s-join "|" *ABBREVIATION-LIST*) ")" *PAP* "\\s)" *EOS*) 'IGNORECASE))) (define (str-subst-regex text regex) (string-substitute (car regex) (cdr regex) text 'global)) (define (first-sentence-breaking text) (define regexs `(("\\n\\s*\\n" . ,*EOS*) (,(string-append "(" *PAP* "\\s)") . ,(string-append "\\1" *EOS*)) (,(string-append "(\\s\\w" *P* ")") . ,(string-append "\\1" *EOS*)))) (foldl str-subst-regex text regexs)) (define (remove-false-end-of-sentence text) (define regexs `((,(string-append "([^-\\w]\\w" *PAP* "\\s)" *EOS*) . "\\1") (,(string-append "([^-\\w]\\w" *P* ")" *EOS*) . "\\1") ; Don't split after a white-space followed by a single letter or number followed ; by a dot followed by another whitespace, such as "Something. 1. point one" ; Note: will fail for 12. Point 12 (,(string-append "(\\s[\\w\\d]\\.\\s+)" *EOS*) . "\\1") ; Fix: bla bla... yada yada (,(string-append "(\\.\\.\\. )" *EOS* "([[:lower:]])") . "\\1\\2") ; Fix "." "?" "!" (,(string-append "([\'\"]" *AP* "['\"]\\s+)" *EOS*) . "\\1") ; Fix where abbreviations exist (,*ABBREVIATIONS* . "\\1") ; Don't break after quote unless its a capital letter. (,(string-append "([\"']\\s*)" *EOS* "(\\s*[[:lower:]])") . "\\1\\2") ; Don't break: text . . some more text. (,(string-append "(\\s\\.\\s)" *EOS* "(\\s*)") . "\\1\\2") (,(string-append "([\"']\\s*)" *EOS* "(\\s*[[:lower:]])") . "\\1\\2") (,(string-append "(\\s" *PAP* "\\s)" *EOS*) . "\\1"))) (foldl str-subst-regex text regexs)) (define (split-unsplit text) (define regexs `((,(string-append "([\\w" *P* "]\\d)(" *P* ")(\\s+)") . ,(string-append "\\1\\2" *EOS* "\\3")) (,(string-append "(" *PAP* "\\s)(\\s*\\()") . ,(string-append "\\1" *EOS* "\\2")) (,(string-append "('\\w" *P* ")(\\s)") . ,(string-append "\\1" *EOS* "\\2")) (,(string-append "(\\sno\\.)(\\s+)(\\?!\\d)") . ,(string-append "\\1" *EOS* "\\2")) (,(string-append "([ap]\\.m\\.\\s+)([[:upper:]])") . ,(string-append "\\1" *EOS* "\\2")))) ; Split where single capital letter followed by dot makes sense to break. ; notice these are exceptions to the general rule NOT to split on single ; letter. Notice also that single letter M is missing here, due to French 'mister' ; which is represented as M. ; The rule will not split on names begining or containing ; single capital letter dot in the first or second name ; assuming 2 or three word name. ; text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg; (foldl str-subst-regex text regexs )) (define (clean-sentences sentences) (define (replace-first text regex) (string-substitute (car regex) (cdr regex) text 1)) (define regexs `(("^\\s" . "") ("\\s$" . "") ("\\s+" . " "))) (map (lambda (s) (foldl replace-first s regexs)) sentences)) ;------------------------------------------------------------------------------ ; set-end-of-sentence-marker! - is used to set the end-of-sentence marker. Default is "\001" ;------------------------------------------------------------------------------ (define (set-end-of-sentence-marker! eos) (set! *EOS* eos)) (define (end-of-sentence-marker) *EOS*) ;------------------------------------------------------------------------------ ; add-acronyms! - is used for adding acronyms not supported by this code. ; The input should be regular expressions for matching the desired acronyms, ; but should not include the final period (C<.>). So, for example, C ; matches C and C. C will match C. You do not ; need to bother with acronyms consisting of single letters and dots ; (e.g. "U.S.A."), as these are found automatically. Note also that acronyms ; are searched for on a case insensitive basis. ;------------------------------------------------------------------------------ (define (add-acronyms! acronyms) (when (not (list? acronyms)) (error "acronyms must be a list")) (set! *ABBREVIATION-LIST* (append acronyms *ABBREVIATION-LIST*)) (build-abbreviations!)) (define (acronyms) *ABBREVIATION-LIST*) ;------------------------------------------------------------------------------ ; get-sentences - takes text input and splits it into sentences. ; A regular expression viciously cuts the text into sentences, ; and then a list of rules (some of them consist of a list of abbreviations) ; are applied on the marked text in order to fix end-of-sentence markings in ; places which are not indeed end-of-sentence. ;------------------------------------------------------------------------------ (define (get-sentences text) (when (not (string? text)) (error "text must be a string")) (let* ((text (first-sentence-breaking text)) (text (remove-false-end-of-sentence text)) (text (split-unsplit text))) (clean-sentences (string-split text *EOS*)))) ; Initilisation procedure (build-abbreviations!) ) ; End of Module