◊(Local Yarn Code "Artifact [73b950a7]")

Artifact 73b950a715298f252c10a5646df6238ef551c24c200a4fdfe564d0971caed716:


#lang racket/base

(require gregor
         racket/match
         racket/string
         txexpr)

(provide ->text
         first-words
         ymd->english)

(module+ test (require rackunit))

;; Concatenate the string elements of a txexpr or list together
(define (->text v)
  (match v
    [(txexpr _ _ elements) (->text elements)]
    [(list elements ...) (string-append* (map ->text elements))]
    [(? string? s) s]
    [_ ""]))

;; Return the first N words out of a list of txexprs. This function will unpack the strings out of
;; the elements of one txexpr at a time until it finds the requested number of words. It aims to be
;; both reliable and fast for any size of list you pass it, and smart about the punctuation it
;; allows through.
(define (first-words txprs words-needed)
  (define punc-allowed-in-word '(#\- #\' #\% #\$ #\‘ #\’ #\# #\& #\/ #\. #\!))
  
  (define (word-boundary? c) (or (char-whitespace? c) (equal? c #\null) (eof-object? c)))
  (define (word-char? c) (or (char-alphabetic? c) (char-numeric? c)))
  
  (define in (open-input-string (->text (car txprs))))
  (define out (open-output-string))
  
  (define words-found
    (let loop ([words-found 0] [last-c #\null] [last-c-in-word? #f])
      (define c (read-char in))

      (cond [(equal? words-found words-needed) words-found]
            [(eof-object? c)
             (cond [(positive? words-found) (if last-c-in-word? (+ 1 words-found) words-found)]
                   [else 0])]
            [else
             (define-values (write-this-char? new-word-count c-in-word?)
               (cond
                 ;; Spaces increment the word count if the previous character was part of,
                 ;; or adjacent to, a word
                 [(and (char-whitespace? c) last-c-in-word?)
                  (values (if (equal? words-needed (+ 1 words-found)) #f #t) (+ 1 words-found) #f)]
                 ;; Some punctuation survives if the previous or next char is part of a word
                 [(member c punc-allowed-in-word)
                  (cond [(or (word-char? last-c) (word-char? (peek-char in)))
                         (values #t words-found #t)]
                        [else (values #f words-found #f)])]
                 [(word-char? c)
                  (values #t words-found #t)]
                 ;; If c is a non-whitespace non-allowed character that immediately follows a word,
                 ;; do not write it out but count it as being part of the word.   
                 [(and (not (word-char? c)) (not (char-whitespace? c)) last-c-in-word?)
                  (values #f words-found #t)]
                 [else (values #f words-found #f)]))

             (cond [(and (char-whitespace? c) write-this-char?) (write-char #\space out)]
                   [write-this-char? (write-char c out)])
             (loop new-word-count c c-in-word?)])))
  
  (define words (get-output-string out))
  (cond [(equal? words-found words-needed) (string-append words "…")]
        [(equal? '() (cdr txprs)) words]
        [else (string-append words " " (first-words (cdr txprs) (- words-needed words-found)))]))

(module+ test
  (require rackunit)
  (define txs-decimals
    '((p "Four score and 7.8 years ago — our fathers brought forth on this continent etc etc")))
  (define txs-punc+split-elems
    '((p "“Stop!” she called.") (p "(She was never one to be silent.)")))
  (define txs-dashes
    '((p [[class "newthought"]] (span [[class "smallcaps"]] "One - and") " only one.")
      (p "That was all she would allow.")))
  (define txs-parens-commas
    '((p "She counted (" (em "one, two") "— silently, eyes unblinking")))
  (define txs-short
    '((span "Not much here!")))

  (check-equal? (first-words txs-decimals 5) "Four score and 7.8 years…")
  (check-equal? (first-words txs-punc+split-elems 5) "Stop! she called. She was…")
  (check-equal? (first-words txs-dashes 5) "One and only one. That…")
  (check-equal? (first-words txs-dashes 4) "One and only one.…")
  (check-equal? (first-words txs-parens-commas 5) "She counted one two silently…")
  (check-equal? (first-words txs-short 5) "Not much here!"))

;; ~~~ Convenience functions for YYYY-MM-DD date strings ~~~

;; These functions ignore everything after the first space in the input!
(define (ymd->dateformat ymd-string dateformat)
  (~t (iso8601->date (car (string-split ymd-string))) dateformat))

(define (ymd->english ymd-string)
  (ymd->dateformat ymd-string "MMMM d, yyyy"))

(module+ test
  (check-equal? (ymd->english "2018-08-12") "August 12, 2018")
  (check-equal? (ymd->dateformat "2018-08-12" "d MMM YYYY") "12 Aug 2018")

  ;; How we handle weird input
  (check-equal? (ymd->english "2018-08-12 everything after 1st space ignored") "August 12, 2018")
  (check-equal? (ymd->english "2018-08 omitting the day") "August 1, 2018")
  (check-equal? (ymd->english "2018 omitting month and day") "January 1, 2018")
  (check-equal? (ymd->dateformat "2018-08-12" "123") "123")

  ;; Stuff we just don't handle
  (check-exn exn:gregor:parse? (lambda () (ymd->english "2018-xyz"))))