Skip to content

Soundex in Scheme

September 21, 2008

Have a look at my implementation of Soundex in Scheme! You need the Spark Scheme interpreter to run this, as we use some Spark extensions. It can be run on other Schemes as well, by making minor modifications.

This is how the program works: It reads a file of English words and builds a Soundex database. This is a one-time process. Then it enters a prompt where the user can type-in a word. If that word is not in the dictionary, a list of possible suggestions are given back to the user. Here is a sample session:

> property
> propritory
(proprietary proprietor)

There are three variables to control the behavior of the spell checker:

  1. dict-file. The complete or relative path of the “words” file. This should contain a list of words separated by “\r\n”.
  2. match-range. This decides how close the suggestions should be to the typo. This defaults to “0.5”. A lesser value will give more words to choose from and a higher value will give only those suggestions that closely match the wrong word. For instance, if you change this value to .3, the above given typo “propritory” will give you the suggestions: “(proprietary proprietor preparatory)”.
  3. show-all-suggestions. If this is true (#t) all possible suggestions are printed. match-range is ignored.

Now, here is the complete source of our spell checker:

;; You can customize the program by changing the values of these three variables:
(define dict-file "words")
(define match-range .5)
(define show-all-suggestions #f)
;; :~

;; The table used to create soundex constants.
(define consonents null)

(define (init-consonents)
  (set! consonents (make-hash-table))
  (let* ((c #(#\b #\f #\p #\v
          #\c #\g #\j #\k #\q #\s #\x #\z
          #\d #\t
          #\m #\n
     (d #(1 1 1 1
        2 2 2 2 2 2 2 2
        3 3
        5 5
     (c-len (vector-length c))
     (i 0))
    (while (< i c-len)
       (hash-table-put! consonents (vector-ref c i) (vector-ref d i))
       (set! i (add1 i)))))       

;; Returns the soundex code of a word.
;; The optional argument decides whether to retrun
;; the full code or truncate it to 3 digits.
(define (soundex word . args)
  (if (null? consonents)
  (set! word (string-downcase word))
  (let ((len (string-length word))
    (sdx (list)) (i 1) (prev-digit 0)
    (curr-digit 0) (cut #t))
    (if (not (null? args))
    (set! cut (car args)))
    (set! sdx (append sdx (list (string-ref word 0))))
    (while (< i len)
       (set! curr-digit (hash-table-get consonents (string-ref word i) null))
       (if (and (not (null? curr-digit))
            (not (= curr-digit prev-digit)))
         (set! sdx (append sdx (list curr-digit)))
         (set! prev-digit curr-digit)))
       (set! i (add1 i))
       (if cut
         (if (not (< (length sdx) 4))
             (set! i len)))))
    (while (< (length sdx) 4)
       (set! sdx (append sdx (list 0))))

;; Reads the words file to memory.
;; Note: This implementation is not very efficient.
(define (load-words)
  (let ((words (list))
    (flen (file-size dict-file)))
    (call-with-input-file dict-file
      (lambda (f)
    (let ((line (read-line f 'return-linefeed)))
      (while (not (eof-object? line))
         (if (symbol? line)
             (set! line (symbol->string line)))
         (if (string? line)
             (set! words (append words (list line))))
         (set! line (read-line f 'return-linefeed))))))

;; Groups words to their soundex code using an association list.
(define soundex-dict (list))

;; The soundex database filename. It is created in the
;; current path by appending the extension ".sdx" to the
;; words file name.
(define (get-soundex-file-name)
  (string-append (path->string (file-name-from-path dict-file)) ".sdx"))

;; Saves the soundex codes hash-map to the .sdx file.
;; Next time the program starts up fast as this
;; file is directly read into memory. If this file
;; exists, the the "words" file is ignored.
(define (save-soundex-dict)
  (let ((file-name (get-soundex-file-name)))
    (call-with-output-file file-name
      (lambda (f)
    (write soundex-dict f)))))

;; Load the .sdx file.
(define (load-soundex-dict)
  (let ((file-name (get-soundex-file-name)))
    (if (file-exists? file-name)
      (call-with-input-file file-name
        (lambda (f)
          (set! soundex-dict (read f))))

;; Creates the soundex association list.
(define (make-soundex-dict)
  (if (not (load-soundex-dict))
    (printf "Creating word patterns. This one time process might take a few minutes.~n")
    (let ((words (load-words))
          (sdx null) (wlist null)
          (w null))
      (while (not (null? words))
         (set! w (car words))
         (set! sdx (soundex w #f))
         (set! wlist (assoc sdx soundex-dict))
         (if (eq? wlist #f)
              (set! wlist (list w))
            (set! soundex-dict (remove soundex-dict (assoc sdx soundex-dict)))
            (set! wlist (cdr wlist))
            (set! wlist (append wlist (list w)))))
         (set! soundex-dict (append soundex-dict (list (cons sdx wlist))))
         (set! words (cdr words))))

;; Returns the number of common characters
;; in the strings w1 and w2.
(define (count-shared-chars w1 w2)
  (let* ((w1-len (string-length w1))
     (w2-len (string-length w2))
     (min-len (min w1-len w2-len))
     (w null) (c 0) (wlst null)
     (count 0) (found (list)))
    (if (= w1-len min-len)
      (set! w w1)
      (set! wlst (string->list w2)))
      (set! w w2)
      (set! wlst (string->list w1))))
    (for i in (range 0 min-len)
     (set! c (string-ref w i))
     (if (= (find found c) -1)
           (if (>= (find wlst c) 0)
             (set! count (add1 count))
             (set! found (append found (list c))))))))

;; Returns a list of words that are phonetically most
;; similar to "word". We use a simple distance calculation
;; algorithm to find these.
(define (find-most-similar-words word words)
  (let ((a (string-length word)) (ret (list)))
    (while (not (null? words))
       (let* ((c (count-shared-chars (car words) word))
          (w (car words))
          (b (string-length w))
          (qs (/ (* 2 c) (+ a b))))
         (if (>= qs match-range)
         (set! ret (append ret (list (cons w qs)))))
         (set! words (cdr words))))
    (set! ret (sort ret (lambda (a b) (> (cdr a) (cdr b)))))

(define (remove-qs lst)
  (let ((ret (list)))
    (while (not (null? lst))
       (set! ret (append ret (list (car (car lst)))))
       (set! lst (cdr lst)))

;; Main program starts here. Create the soundex database if it is not there:
(printf "Enter words to spellcheck. Terminate the session by typing QUIT.~n")

(define word null)

;; Read words from the user, give spelling suggestions if he inputs a wrong word.
;; This loop can be terminated by typing "QUIT".
(while #t
       (printf "> ")
       (set! word (read))
       (if (not (eq? word 'QUIT))
         (let ((s (assoc (soundex (symbol->string word) #f)
           (if (eq? s #f)
           (printf "NO SUGGESTIONS~n")
             (let ((words (cdr s)))
               (if (not (find-if words (lambda (i) (string=? i (symbol->string word)))))
                 (if show-all-suggestions
                 (printf "~a~n" words)
                 (printf "~a~n" (remove-qs (find-most-similar-words
                                    (symbol->string word) words)))))))))))

Leave a Reply

Please log in using one of these methods to post your comment: Logo

You are commenting using your account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: