;   ____             _                        ____        _ _ _   _            
;  / ___|  ___ _ __ | |_ ___ _ __   ___ ___  / ___| _ __ | (_) |_| |_ ___ _ __ 
;  \___ \ / _ \ '_ \| __/ _ \ '_ \ / __/ _ \ \___ \| '_ \| | | __| __/ _ \ '__|
;   ___) |  __/ | | | ||  __/ | | | (_|  __/  ___) | |_) | | | |_| ||  __/ |   
;  |____/ \___|_| |_|\__\___|_| |_|\___\___| |____/| .__/|_|_|\__|\__\___|_|   
;                                                  |_|                         

; Ported from the Lingua::EN:Sentence Perl module 
; (https://github.com/kimryan/Lingua-EN-Sentence) by David Ireland  
; djireland79 at gmail dot com

; This module contains the procedure get-sentences, which splits text into its
; constituent sentences, based on a regular expressions and a list of 
; abbreviations (built in and given).

; Certain well know exceptions, such as abbreviations, may cause incorrect
; segmentations. But some of them are already integrated into this code and are
; being taken care of. Still, if you see that there are words causing the
; get-sentences procedure to fail, you can add those to the module, so it notices them


;                   _    _                  _ _   _               
;                  / \  | | __ _  ___  _ __(_) |_| |__  _ __ ___  
;                 / _ \ | |/ _` |/ _ \| '__| | __| '_ \| '_ ` _ \ 
;                / ___ \| | (_| | (_) | |  | | |_| | | | | | | | |
;               /_/   \_\_|\__, |\___/|_|  |_|\__|_| |_|_| |_| |_|
;                          |___/                                  


; The algorithm uses a series of regular expressions to split the text into 
; sentences. Possible locations for end-of-sentence are tagged. A set of rules
; decide when an end-of-sentence is justified and when it is a mistake. 
; In case of a mistake, the end-of-sentence mark is removed.
;
; What are such mistakes? Cases of abbreviations, for example. I have a list of
; such abbreviations (Please see `Acronym/Abbreviations list' section), and more
; general rules (for example, the abbreviations 'i.e.' and '.e.g.' need not to be
; in the list as a special rule takes care of all single letter abbreviations).

(module sentence-split
    (export  get-sentences
             set-end-of-sentence-marker! 
             end-of-sentence-marker
             add-acronyms!
             acronyms)
    (import data-structures chicken scheme)
    (use s regex)

(define *EOS* "\001")                      ; End of sentence marker
(define *P*   "[\\.!\\?]")                 ; Punctuation
(define *AP*  "(?:'|\"|\\?|\\)|\\]|\\})?") ; After punctuation
(define *PAP* (string-append *P* *AP*))

(define *PEOPLE* '("mr" "mrs" "ms" "dr" "prof" "mme" "ms?gr" "sens?"
                   "reps?" "gov" "attys?" "supt" "insp" "const" "det"
                   "pr" "revd?" "ald" "rt" "hon"))

(define *TITLE-SUFFIXES* '("PhD" "jn?r" "sn?r" "esq" "md" "llb"))

(define *MILITARY* '("col" "gen" "lt" "cdr" "cmdr" "adm" "capt"
                     "sgt" "cpl" "maj" "pte"))

(define *INSTITUTES* '("dept" "univ" "assn" "bros"))

(define *COMPANIES* '("inc" "ltd" "co" "corp"))

(define *PLACES* '("arc" "al" "ave" "blv?d" "cl" "ct" "cres" "dr" "expy?"
                   "fw?y" "hwa?y" "la" "pde?" "pl" "plz" "rd" "st" "tce"
                   "dist" "mt" "km" "in" "ft" "Ala" "Ariz" "Ark" "Cal"
                   "Calif" "Col" "Colo" "Conn" "Del" "Fed" "Fla" "Ga"
                   "Ida" "Id" "Ill" "Ind" "Ia" "Kan" "Kans" "Ken" "Ky"
                   "La" "Me" "Md" "Is" "Mass" "Mich" "Minn" "Miss" "Mo"
                   "Mont" "Neb" "Nebr" "Nev" "Mex" "Okla" "Ok" "Ore" 
                   "Penna" "Penn" "Pa" "Dak" "Tenn" "Tex" "Ut" "Vt" "Va"
                   "Wash" "Wis" "Wisc" "Wy" "Wyo" "USAFA" "Alta" "Man"
                   "Ont" "Qu?" "Sask" "Yuk" "Aust" "Vic" "Qld" "Tas"))

(define *MONTHS* '("jan" "feb" "mar" "apr" "may" "jun" "jul" "aug" "sep"
                   "sept" "oct" "nov" "dec"))

(define *MISC*  '("no" "esp" "est")) 

(define *LATIN* '("vs" "etc" "al" "ibid" "sic"))

(define *ABBREVIATION-LIST* (append *PEOPLE*
                                    *TITLE-SUFFIXES*
                                    *MILITARY*
                                    *INSTITUTES*
                                    *COMPANIES*
                                    *PLACES*
                                    *MONTHS*
                                    *MISC*
                                    *LATIN*))

(define *ABBREVIATIONS* #f)

(define (build-abbreviations!)
  (set! *ABBREVIATIONS* (regexp
                          (string-append "(\\b(\\?:"
                                         (s-join "|" *ABBREVIATION-LIST*)
                                         ")"
                                         *PAP*
                                         "\\s)"
                                         *EOS*) 'IGNORECASE)))

(define (str-subst-regex text regex)
  (string-substitute (car regex) (cdr regex) text 'global))

(define (first-sentence-breaking text)
 (define regexs 
   `(("\\n\\s*\\n" . ,*EOS*) 
     (,(string-append "(" *PAP* "\\s)")  . ,(string-append "\\1" *EOS*))
     (,(string-append "(\\s\\w" *P* ")") . ,(string-append "\\1" *EOS*))))

   (foldl str-subst-regex text regexs))

(define (remove-false-end-of-sentence text)
    (define regexs
      `((,(string-append "([^-\\w]\\w" *PAP* "\\s)" *EOS*) . "\\1")
        (,(string-append "([^-\\w]\\w" *P* ")" *EOS*)      . "\\1")

        ; Don't split after a white-space followed by a single letter or number followed
        ; by a dot followed by another whitespace, such as "Something. 1. point one"
        ; Note: will fail for 12. Point 12
        (,(string-append "(\\s[\\w\\d]\\.\\s+)" *EOS*) . "\\1")

        ; Fix: bla bla... yada yada
        (,(string-append "(\\.\\.\\. )" *EOS* "([[:lower:]])") . "\\1\\2")
        
        ; Fix "." "?" "!"
        (,(string-append "([\'\"]" *AP* "['\"]\\s+)" *EOS*) . "\\1")
        
        ; Fix where abbreviations exist
        (,*ABBREVIATIONS* . "\\1")
    
        ; Don't break after quote unless its a capital letter.
        (,(string-append "([\"']\\s*)" *EOS* "(\\s*[[:lower:]])") . "\\1\\2")

        ; Don't break: text . . some more text.
        (,(string-append "(\\s\\.\\s)" *EOS* "(\\s*)") . "\\1\\2")
    
        (,(string-append "([\"']\\s*)" *EOS* "(\\s*[[:lower:]])") . "\\1\\2")
        (,(string-append "(\\s" *PAP* "\\s)" *EOS*) . "\\1")))

    (foldl str-subst-regex text regexs))

(define (split-unsplit text)
  (define regexs
    `((,(string-append "([\\w" *P* "]\\d)(" *P* ")(\\s+)") . ,(string-append "\\1\\2" *EOS* "\\3"))
      (,(string-append "(" *PAP* "\\s)(\\s*\\()")          . ,(string-append "\\1"    *EOS* "\\2"))
      (,(string-append "('\\w" *P* ")(\\s)")               . ,(string-append "\\1"    *EOS* "\\2"))
      (,(string-append "(\\sno\\.)(\\s+)(\\?!\\d)")        . ,(string-append "\\1"    *EOS* "\\2"))
      (,(string-append "([ap]\\.m\\.\\s+)([[:upper:]])")   . ,(string-append "\\1"    *EOS* "\\2"))))

  ;  Split where single capital letter followed by dot makes sense to break.
  ;  notice these are exceptions to the general rule NOT to split on single
  ;  letter. Notice also that single letter M is missing here, due to French 'mister'
  ;  which is represented as M.
  
  ;  The rule will not split on names begining or containing
  ;  single capital letter dot in the first or second name
  ;  assuming 2 or three word name.
  ;  text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg;
  (foldl str-subst-regex text regexs ))

(define (clean-sentences sentences)
  (define (replace-first text regex)
    (string-substitute (car regex) (cdr regex) text 1))
    
  (define regexs `(("^\\s" . "") 
                   ("\\s$" . "") 
                   ("\\s+"  . " ")))
  (map
    (lambda (s) 
      (foldl replace-first s regexs))
    sentences))


;------------------------------------------------------------------------------
; set-end-of-sentence-marker! - is used to set the end-of-sentence marker. Default is "\001"
;------------------------------------------------------------------------------
(define (set-end-of-sentence-marker! eos)
  (set! *EOS* eos))

(define (end-of-sentence-marker)
  *EOS*)

;------------------------------------------------------------------------------
; add-acronyms! - is used for adding acronyms not supported by this code. 
; The input should be regular expressions for matching the desired acronyms,
; but should not include the final period (C<.>). So, for example, C<blv?d>
; matches C<blvd.> and C<bld.>. C<a\.mlf> will match C<a.mlf.>. You do not
; need to bother with acronyms consisting of single letters and dots
; (e.g. "U.S.A."), as these are found automatically. Note also that acronyms
; are searched for on a case insensitive basis.
;------------------------------------------------------------------------------
(define (add-acronyms! acronyms)
  (when (not (list? acronyms))
    (error "acronyms must be a list"))
  (set! *ABBREVIATION-LIST* (append acronyms *ABBREVIATION-LIST*))
  (build-abbreviations!))

(define (acronyms)
  *ABBREVIATION-LIST*)

;------------------------------------------------------------------------------
; get-sentences - takes text input and splits it into sentences.
; A regular expression viciously cuts the text into sentences,
; and then a list of rules (some of them consist of a list of abbreviations)
; are applied on the marked text in order to fix end-of-sentence markings in
; places which are not indeed end-of-sentence.
;------------------------------------------------------------------------------
(define (get-sentences text)
  (when (not  (string? text))
    (error "text must be a string"))

  (let* ((text (first-sentence-breaking text))
         (text (remove-false-end-of-sentence text))
         (text (split-unsplit text)))
    (clean-sentences
      (string-split text *EOS*))))


; Initilisation procedure
(build-abbreviations!)

) ; End of Module