#lang racket/base
(require gregor
racket/match
racket/string
txexpr)
(provide ->text
first-words
ymd->english)
(module+ test (require rackunit))
;; Concatenate the string elements of a txexpr or list together
(define (->text v)
(match v
[(txexpr _ _ elements) (->text elements)]
[(list elements ...) (string-append* (map ->text elements))]
[(? string? s) s]
[_ ""]))
;; Return the first N words out of a list of txexprs. This function will unpack the strings out of
;; the elements of one txexpr at a time until it finds the requested number of words. It aims to be
;; both reliable and fast for any size of list you pass it, and smart about the punctuation it
;; allows through.
(define (first-words txprs words-needed)
(define punc-allowed-in-word '(#\- #\' #\% #\$ #\‘ #\’ #\# #\& #\/ #\. #\!))
(define (word-boundary? c) (or (char-whitespace? c) (equal? c #\null) (eof-object? c)))
(define (word-char? c) (or (char-alphabetic? c) (char-numeric? c)))
(define in (open-input-string (->text (car txprs))))
(define out (open-output-string))
(define words-found
(let loop ([words-found 0] [last-c #\null] [last-c-in-word? #f])
(define c (read-char in))
(cond [(equal? words-found words-needed) words-found]
[(eof-object? c)
(cond [(positive? words-found) (if last-c-in-word? (+ 1 words-found) words-found)]
[else 0])]
[else
(define-values (write-this-char? new-word-count c-in-word?)
(cond
;; Spaces increment the word count if the previous character was part of,
;; or adjacent to, a word
[(and (char-whitespace? c) last-c-in-word?)
(values (if (equal? words-needed (+ 1 words-found)) #f #t) (+ 1 words-found) #f)]
;; Some punctuation survives if the previous or next char is part of a word
[(member c punc-allowed-in-word)
(cond [(or (word-char? last-c) (word-char? (peek-char in)))
(values #t words-found #t)]
[else (values #f words-found #f)])]
[(word-char? c)
(values #t words-found #t)]
;; If c is a non-whitespace non-allowed character that immediately follows a word,
;; do not write it out but count it as being part of the word.
[(and (not (word-char? c)) (not (char-whitespace? c)) last-c-in-word?)
(values #f words-found #t)]
[else (values #f words-found #f)]))
(cond [(and (char-whitespace? c) write-this-char?) (write-char #\space out)]
[write-this-char? (write-char c out)])
(loop new-word-count c c-in-word?)])))
(define words (get-output-string out))
(cond [(equal? words-found words-needed) (string-append words "…")]
[(equal? '() (cdr txprs)) words]
[else (string-append words " " (first-words (cdr txprs) (- words-needed words-found)))]))
(module+ test
(require rackunit)
(define txs-decimals
'((p "Four score and 7.8 years ago — our fathers brought forth on this continent etc etc")))
(define txs-punc+split-elems
'((p "“Stop!” she called.") (p "(She was never one to be silent.)")))
(define txs-dashes
'((p [[class "newthought"]] (span [[class "smallcaps"]] "One - and") " only one.")
(p "That was all she would allow.")))
(define txs-parens-commas
'((p "She counted (" (em "one, two") "— silently, eyes unblinking")))
(define txs-short
'((span "Not much here!")))
(check-equal? (first-words txs-decimals 5) "Four score and 7.8 years…")
(check-equal? (first-words txs-punc+split-elems 5) "Stop! she called. She was…")
(check-equal? (first-words txs-dashes 5) "One and only one. That…")
(check-equal? (first-words txs-dashes 4) "One and only one.…")
(check-equal? (first-words txs-parens-commas 5) "She counted one two silently…")
(check-equal? (first-words txs-short 5) "Not much here!"))
;; ~~~ Convenience functions for YYYY-MM-DD date strings ~~~
;; These functions ignore everything after the first space in the input!
(define (ymd->dateformat ymd-string dateformat)
(~t (iso8601->date (car (string-split ymd-string))) dateformat))
(define (ymd->english ymd-string)
(ymd->dateformat ymd-string "MMMM d, yyyy"))
(module+ test
(check-equal? (ymd->english "2018-08-12") "August 12, 2018")
(check-equal? (ymd->dateformat "2018-08-12" "d MMM YYYY") "12 Aug 2018")
;; How we handle weird input
(check-equal? (ymd->english "2018-08-12 everything after 1st space ignored") "August 12, 2018")
(check-equal? (ymd->english "2018-08 omitting the day") "August 1, 2018")
(check-equal? (ymd->english "2018 omitting month and day") "January 1, 2018")
(check-equal? (ymd->dateformat "2018-08-12" "123") "123")
;; Stuff we just don't handle
(check-exn exn:gregor:parse? (lambda () (ymd->english "2018-xyz"))))