├── .gitignore
├── nltk
│   ├── graphs
│   │   └── inaugural-dispersion.jpg
│   ├── package.lisp
│   ├── ch1-2.lisp
│   ├── ch1-1.lisp
│   ├── ch1-2.md
│   └── ch1-1.md
├── data
│   └── abbrevs-with-dot.txt
├── README.md
├── src
│   ├── contrib
│   │   ├── packages.lisp
│   │   └── ms-ngrams.lisp
│   ├── test-util.lisp
│   ├── corpora
│   │   ├── corpus.lisp
│   │   ├── nps-chat.lisp
│   │   └── brown.lisp
│   ├── core
│   │   ├── measures.lisp
│   │   ├── language-models.lisp
│   │   ├── indexing.lisp
│   │   ├── tokenization.lisp
│   │   └── ngrams.lisp
│   ├── user.lisp
│   ├── packages.lisp
│   ├── generation
│   │   └── markov-chain.lisp
│   └── util.lisp
├── LICENSE
├── cl-nltk.asd
└── cl-nlp.asd

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*~
*.fasl
\#*
.*
!.gitignore

--------------------------------------------------------------------------------
/nltk/graphs/inaugural-dispersion.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/html/cl-nlp/master/nltk/graphs/inaugural-dispersion.jpg

--------------------------------------------------------------------------------
/nltk/package.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(cl:defpackage #:nltk
  (:use #:common-lisp #:rutil #:nlp-user))

--------------------------------------------------------------------------------
/data/abbrevs-with-dot.txt:
--------------------------------------------------------------------------------
e.g.
i.e.
Mr.
Ms.
Mrs.
Dr.
St.
Rd.
Ave.
A.D.
B.C.
Mt.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CL-NLP — Lisp NLP tool-set

Caution: this is vaporware! Come back in 2014, unless you want to contribute...

...but if you want to contribute, you're very welcome.
Feel free to write to <vseloved@gmail.com>

--------------------------------------------------------------------------------
/src/contrib/packages.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(cl:defpackage #:nlp.contrib.ngrams
  (:use #:common-lisp #:rutil #:nlp)
  (:export #:ms-ngrams
           #:ms-ngrams-url
           #:ms-ngrams-user-token
           #:ms-ngrams-catalog))

--------------------------------------------------------------------------------
/src/test-util.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.test-util)
(named-readtables:in-readtable rutils-readtable)


(defun test-file (filename)
  (merge-pathnames (strcat "test/" filename) +project-root+))


(defun test-on-corpus (func corpus expected &key (test 'equal))
  ;; TODO: a stub. It should run FUNC over CORPUS, compare the results
  ;; with EXPECTED using TEST, and tally true/false positives.
  (let ((tp 0)
        (fp 0))
    ))

#+nil
(test-on-corpus #`(tokenize (make 'regex-word-tokenizer) %)
                +brown-corpus+
                (test-file "brown-tokenization.txt"))

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2013 Vsevolod Dyomkin

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------
/cl-nltk.asd:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:asdf)

(defsystem #:cl-nltk
  :version "0.0.2"
  :description "Implementation of the examples from the NLTK book."
  :author "Vsevolod Dyomkin <vseloved@gmail.com>"
  :maintainer "Vsevolod Dyomkin <vseloved@gmail.com>"
  :license "Apache 2.0"
  :depends-on (#:rutils #:cl-ppcre #:cl-nlp #:cgn)
  :serial t
  :components
  ((:module #:nltk
    :serial t
    :components
    ((:file "package")
     (:file "ch1-1")
     (:file "ch1-2")))))

--------------------------------------------------------------------------------
/src/corpora/corpus.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.corpora)
(named-readtables:in-readtable rutils-readtable)


(defstruct corpus
  "A structure to cache a unilingual corpus in raw and processed forms."
  name
  lang
  raw-texts
  clean-texts
  text-tokens)

(defstruct token
  "A corpus token with position and possibly tag."
  beg
  end
  word
  tag)


(defgeneric read-corpus (type file)
  (:documentation
   "Read corpus data of a certain TYPE (a keyword) from FILE.
    Returns as values:
    - raw text data
    - cleaned-up text
    - list of tokens from the text"))

--------------------------------------------------------------------------------
/src/core/measures.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.core)
(named-readtables:in-readtable rutils-readtable)


(declaim (inline log2))
(defun log2 (x)
  "Base 2 logarithm."
  (/ (log x) (log 2)))

(defun entropy (samples &optional total)
  "Compute Shannon's entropy of the SAMPLES list.
   To save on calculation, a pre-calculated TOTAL can be provided."
  (unless total
    (setf total (reduce #'+ samples)))
  (reduce #'+ (mapcar #`(if (zerop %)
                            0
                            (let ((r (/ % total)))
                              (* r (log2 r))))
                      samples)))

(defun log-likelihood-ratio (ab a~b ~ab ~a~b)
  "Calculate the log-likelihood ratio between events A and B given
   counts of A and B occurring together and separately."
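  ;; The ratio is computed from the 2x2 contingency table of the two events
  ;; via entropies: 2 * total * (H(rows) + H(cols) - H(table)), which is what
  ;; the subtraction below amounts to, since ENTROPY above returns the
  ;; negated sum of r*log2(r). A sketch on made-up counts (bigram "new york"
  ;; seen 10 times, "new" without "york" 20 times, "york" without "new"
  ;; 5 times, out of 10000 bigrams):
  ;;   (log-likelihood-ratio 10 20 5 9965)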
  (let ((total (+ ab a~b ~ab ~a~b)))
    (* 2 total (- (entropy (list ab a~b ~ab ~a~b) total)
                  (entropy (list (+ ab a~b) (+ ~ab ~a~b)) total)
                  (entropy (list (+ ab ~ab) (+ a~b ~a~b)) total)))))

--------------------------------------------------------------------------------
/src/user.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp-user)
(named-readtables:in-readtable rutils-readtable)


(defun print-word-in-contexts (word string &key (width 25) pass-newlines limit)
  "Print all (or LIMIT) WORD occurrences in STRING with the surrounding
   context up to WIDTH chars. If PASS-NEWLINES isn't set, the context
   will be shown up to the closest newline."
  (declare (type (integer 0) width))
  (let* ((regex (re:create-scanner (fmt "\\b~A\\b" word)
                                   :case-insensitive-mode t))
         (matches (re:all-matches regex string))
         (len (/ (length matches) 2)))
    (format t "Displaying ~A of ~A matches~%" (if limit (min limit len) len) len)
    (loop :for (s e) :on matches :by #'cddr
          :for i :from 1 :do
       (when (and limit (> i limit))
         (return))
       (let ((s- (max 0 (- s width)))                   ; clamp the context
             (e+ (min (length string) (+ e width))))    ; to the string bounds
         (if pass-newlines
             (format t "~A~%" (substitute-if #\Space #'white-char-p
                                             (subseq string s- e+)))
             (let ((l-pos (max s- (1+ (or (position #\Newline string
                                                    :end s :from-end t)
                                          -1))))
                   (r-pos (min e+ (1- (or (position #\Newline string :start e)
                                          (1+ (length string)))))))
               (format t "~@[~A~]~A~@[~A~]~%"
                       (unless (= s- l-pos) (filler (- l-pos s-)))
                       (subseq string l-pos r-pos)
                       (unless (= e+ r-pos) (filler (- e+ r-pos))))))))))

--------------------------------------------------------------------------------
/cl-nlp.asd:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:asdf)

(defsystem #:cl-nlp
  :version "0.0.3"
  :description "NLP toolkit for Common Lisp."
  :author "Vsevolod Dyomkin <vseloved@gmail.com>"
  :maintainer "Vsevolod Dyomkin <vseloved@gmail.com>"
  :license "Apache 2.0"
  :depends-on (#:rutils #:cl-fad #:cl-ppcre #:cxml)
  :serial t
  :components
  ((:module #:src
    :serial t
    :components
    ((:file "packages")
     (:file "util")
     (:module #:corpora
      :serial t
      :components
      ((:file "corpus")
       (:file "brown")
       (:file "nps-chat")))
     (:file "test-util")
     (:module #:core
      :serial t
      :components
      ((:file "measures")
       (:file "tokenization")
       (:file "ngrams")
       (:file "language-models")
       (:file "indexing")))
     (:module #:generation
      :serial t
      :components
      ((:file "markov-chain")))
     (:file "user")))))

(defsystem #:cl-nlp.contrib
  :version "0.0.1"
  :description "CL-NLP additional packages."
  :author "Vsevolod Dyomkin <vseloved@gmail.com>"
  :maintainer "Vsevolod Dyomkin <vseloved@gmail.com>"
  :license "Apache 2.0"
  :depends-on (#:cl-nlp #:drakma)
  :serial t
  :components
  ((:module #:src
    :components
    ((:module #:contrib
      :serial t
      :components
      ((:file "packages")
       (:file "ms-ngrams")))))))

--------------------------------------------------------------------------------
/src/corpora/nps-chat.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.corpora)
(named-readtables:in-readtable rutils-readtable)


(defmethod read-corpus-file ((type (eql :nps-chat)) file)
  "Read an individual file from the NPS Chat Corpus."
  (cxml:parse-file file (make 'nps-chat-sax)))

#+manually
(defparameter +nps-chat-corpus+
  (let ((corpus (make-corpus :name "NPS Chat Corpus" :lang :en-us)))
    (fad:walk-directory
     (merge-pathnames "corpora/nps_chat/" +project-root+)
     #`(when (string= "xml" (pathname-type %))
         (mv-bind (text tokens) (read-corpus-file :nps-chat %)
           (push (mapcar #'car text) (corpus-raw-texts corpus))
           (push tokens (corpus-text-tokens corpus)))))
    corpus)
  "NPS Chat Corpus, Release 1.0 (July 2008).")


;; SAX parsing

(defun attr (name attributes)
  "Get the value of the attribute NAME among SAX ATTRIBUTES."
  (sax::standard-attribute-value
   (find name attributes :test 'string=
         :key #'sax::standard-attribute-local-name)))

(defclass nps-chat-sax (sax:sax-parser-mixin)
  ((posts :initform nil)
   (tokens :initform nil)))

(defmethod sax:start-element ((sax nps-chat-sax)
                              namespace-uri local-name qname attributes)
  (with-slots (posts tokens) sax
    (case (mkeyw local-name)
      (:post (push (attr "class" attributes) posts))
      (:terminals (push (list nil) tokens))
      (:t (push (make-token :word (attr "word" attributes)
                            :tag (mkeyw (attr "pos" attributes)))
                (car tokens))))))

(defmethod sax:characters ((sax nps-chat-sax) data)
  (with-slots (posts) sax
    (when (stringp (car posts))
      (setf (car posts) (cons data (car posts))))))

(defmethod sax:end-element ((sax nps-chat-sax) namespace-uri local-name qname)
  ;; defined to avoid warnings
  )

(defmethod sax:end-document ((sax nps-chat-sax))
  (values (reverse (slot-value sax 'posts))
          (reverse (mapcar #`(rest (reverse %))
                           (slot-value sax 'tokens)))))

--------------------------------------------------------------------------------
/src/contrib/ms-ngrams.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.contrib.ngrams)
(named-readtables:in-readtable rutils-readtable)


;;; Microsoft ngrams

(defclass ms-ngrams (ngrams)
  ((count :initform -1)  ; special value to indicate that we don't know it :)
   (url :initarg :url :accessor ms-ngrams-url
        :initform "http://web-ngram.research.microsoft.com/rest/lookup.svc")
   (user-token :initarg :user-token :accessor ms-ngrams-user-token)
   (catalog :initarg :catalog :initform "bing-body/apr10"
            :accessor ms-ngrams-catalog))
  (:documentation
   "Frontend to the Microsoft Ngrams service.
    See http://web-ngram.research.microsoft.com/info/"))

(flet ((to-string (ngram)
         "If NGRAM is a list, convert it to a string."
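         ;; The MACROLETs below build the actual REST calls: single lookups
         ;; are GET requests of the form
         ;;   <url>/<catalog>/<order>/<op>?u=<user-token>&p=<ngram>
         ;; while batch lookups are POST requests with one ngram per line.
         ;; Hypothetical usage ("MY-TOKEN" is a placeholder):
         ;;   (logprob (make 'ms-ngrams :order 2 :user-token "MY-TOKEN")
         ;;            '("hello" "world"))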
         (if (listp ngram) (strjoin " " ngram) ngram)))

  (macrolet ((query-ngrams (op)
               `(with-slots (url user-token catalog order) ngrams
                  (let ((*read-eval* nil))
                    (read-from-string
                     (drakma:http-request
                      (fmt "~A/~A/~A/~A?u=~A&p=~A"
                           url catalog order ,op user-token
                           (to-string ngram))))))))

    (defmethod logprob ((ngrams ms-ngrams) ngram)
      (query-ngrams "jp"))

    (defmethod cond-logprob ((ngrams ms-ngrams) ngram)
      (query-ngrams "cp"))

    )  ; end of macrolet

  (macrolet ((query-ngrams (op)
               `(with-slots (url user-token catalog order) ngrams
                  (let ((*read-eval* nil))
                    (mapcar #'read-from-string
                            (split-sequence
                             #\Newline (drakma:http-request
                                        (fmt "~A/~A/~A/~A?u=~A"
                                             url catalog order ,op user-token)
                                        :method :post
                                        :content (fmt "~{~A~%~}"
                                                      (mapcar #'to-string
                                                              ngrams-list)))))))))

    (defmethod logprobs ((ngrams ms-ngrams) &rest ngrams-list)
      (query-ngrams "jp"))

    (defmethod cond-logprobs ((ngrams ms-ngrams) &rest ngrams-list)
      (query-ngrams "cp"))

    )  ; end of macrolet
  )  ; end of flet


(defmethod ngrams-eq ((ngrams ms-ngrams))
  #'equalp)

--------------------------------------------------------------------------------
/nltk/ch1-2.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nltk)
(named-readtables:in-readtable rutils-readtable)

(defclass text ()
  ((name :initarg :name)
   (raw :initarg :raw :accessor text-raw)
   (words :accessor text-words)
   (ctxs :accessor text-ctxs)
   (transitions :accessor text-transitions)
   (dispersion :accessor text-dispersion)
   (ugrams :accessor text-ugrams)
   (bigrams :accessor text-bigrams)
   (trigrams :accessor text-trigrams)))

(defmethod slot-unbound (class (obj text) (slot (eql 'ugrams)))
  (with-slots (words ugrams) obj
    (format t "~&Indexing unigrams...~%")
    (prog1 (setf ugrams (index-ngrams 1 words))
      (format t "Number of ugrams: ~A~%" (ngrams-count ugrams)))))

(defmethod slot-unbound (class (obj text) (slot (eql 'bigrams)))
  (with-slots (words bigrams) obj
    (format t "~&Indexing bigrams...~%")
    (prog1 (setf bigrams (index-ngrams 2 words))
      (format t "Number of bigrams: ~A~%" (ngrams-count bigrams)))))

(defmethod slot-unbound (class (obj text) (slot (eql 'trigrams)))
  (with-slots (words trigrams) obj
    (format t "~&Indexing trigrams...~%")
    (prog1 (setf trigrams (index-ngrams 3 words))
      (format t "Number of trigrams: ~A~%" (ngrams-count trigrams)))))

(defun collocations (text)
  (find-collocations (text-bigrams text) :n 30))

(defun generate (text &key (n 20) (order 2))
  "Generate random text of N words, based on TEXT."
  (with-slots (transitions) text
    (string-trim (append +white-chars+ +newline-chars+)
                 (fmt "~{~A ~}"
                      (generate-text (make 'markov-chain-generator :order order)
                                     (make-lm 'stupid-backoff-lm
                                              :1g (text-ugrams text)
                                              :2g (when (> order 1)
                                                    (text-bigrams text))
                                              :3g (when (> order 2)
                                                    (text-trigrams text)))
                                     n)))))

;; Plotting

(defun dump-counts (ngrams n order-by cumulative)
  "Dump N NGRAMS counts (or CUMULATIVE counts) ordered by ORDER-BY."
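  ;; The dump is a plain whitespace-separated file that gnuplot can read
  ;; directly: rank, the ngram itself, and its (possibly cumulative) count,
  ;; one ngram per line, e.g. (made-up counts):
  ;;   1  ("the")  14620
  ;;   2  ("of")   6976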
  (let ((filename (fmt "/tmp/~A" (gensym)))
        (total 0))
    (with-out-file (out filename)
      (doindex (i pair (ngrams-pairs ngrams :order-by order-by))
        (when (and n (> i n))
          (return))
        (format out "~A~t~S~t~A~%" (1+ i) (car pair)
                (if cumulative
                    (incf total (cdr pair))
                    (cdr pair)))))
    filename))

(defun plot (ngrams &key n (order-by '>) cumulative)
  "Plot NGRAMS counts."
  (cgn:with-gnuplot (t)
    (cgn:format-gnuplot "set xtics rotate 90")
    (cgn:format-gnuplot "set ylabel \"~@[Cumulative ~]Counts\"" cumulative)
    (cgn:format-gnuplot
     "plot \"~A\" using 1:3:xtic(2) with lines title \"\""
     (dump-counts ngrams n order-by cumulative))))

--------------------------------------------------------------------------------
/src/corpora/brown.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.corpora)
(named-readtables:in-readtable rutils-readtable)


(defmethod read-corpus-file ((type (eql :brown)) file)
  "Read an individual file from the Brown corpus."
  (let ((text (string-trim +white-chars+ (read-file file)))
        (offset 0)
        tokens)
    (loop :for (beg end) :on (re:all-matches "[^\\s]+" text) :by #'cddr :do
       (let* ((tok (subseq text beg end))
              (/-pos (position #\/ tok :from-end t))
              (word (subseq tok 0 /-pos))
              (tag (subseq tok (1+ /-pos))))
         (when (member word '("``" "''") :test #'string=)
           (setf word "\""
                 end (1- end)))
         (push (make-token :beg (- beg offset)
                           :end (- end (incf offset (1+ (length tag))))
                           :word word :tag tag)
               tokens)))
    (values text
            (let ((clean-text (make-string (token-end (first tokens))
                                           :initial-element #\Space)))
              (dolist (token (reverse tokens) clean-text)
                (with-slots (beg end word) token
                  (setf (subseq clean-text beg end) word))))
            (reverse tokens))))

#+manually
(defparameter +brown-corpus+
  (let ((brown-topics-mapping #{#\a :press-reportage
                                #\b :press-editorial
                                #\c :press-reviews
                                #\d :religion
                                #\e :skill-and-hobbies
                                #\f :popular-lore
                                #\g :belles-lettres
                                #\h :miscellaneous-government-house-organs
                                #\j :learned
                                #\k :fiction-general
                                #\l :fiction-mystery
                                #\m :fiction-science
                                #\n :fiction-adventure
                                #\p :fiction-romance
                                #\r :humor
                               })
        (raw-texts #{})
        (clean-texts #{})
        (text-tokens #{})
        (corpus #{}))
    (dolist (path (directory (merge-pathnames "corpora/brown/*" +project-root+)))
      (when (= 4 (length (pathname-name path)))
        (let ((topic (get# (char (pathname-name path) 1) brown-topics-mapping)))
          (unless (get# topic raw-texts)
            (set# topic raw-texts '(nil))
            (set# topic clean-texts '(nil))
            (set# topic text-tokens '(nil)))
          (mv-bind (raw text tokens) (read-corpus-file :brown path)
            (push raw (get# topic raw-texts))
            (push text (get# topic clean-texts))
            (push tokens (get# topic text-tokens))))))
    (dolist (topic (ht-keys raw-texts))
      (set# topic corpus
            (make-corpus
             :name (fmt "Brown Corpus - ~A" topic)
             :lang :en-us
             :raw-texts (rest (reverse (get# topic raw-texts)))
             :clean-texts (rest (reverse (get# topic clean-texts)))
             :text-tokens (rest (reverse (get# topic text-tokens))))))
    corpus)
  "Brown University Standard Corpus of Present-Day American English.")
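
;; Brown corpus files interleave words with their POS tags, e.g. the first
;; sentence of file ca01 begins: The/at Fulton/np-tl County/nn-tl ...
;; which is why READ-CORPUS-FILE above splits each token on its last slash.
#+nil
(read-corpus-file :brown (merge-pathnames "corpora/brown/ca01" +project-root+))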
--------------------------------------------------------------------------------
/src/packages.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(cl:defpackage #:nlp.util
  (:nicknames #:nutil)
  (:use #:common-lisp #:rutil)
  (:export #:cl-nlp-error
           #:not-implemented-error

           #:+newline+
           #:+newline-chars+
           #:newline-char-p
           #:+white-chars+
           #:white-char-p
           #:+period-chars+
           #:period-char-p

           #:*stopwords-en*
           #:ending-word-p

           #:filler

           #:+project-root+
           #:data-file
           #:list-from-file

           #:bin-search

           #:define-lazy-singleton
           #:sorted-ht-keys
           #:shorter?
           ))

(cl:defpackage #:nlp.corpora
  (:nicknames #:ncorp)
  (:use #:common-lisp #:rutil #:nlp.util)
  (:export #:corpus
           #:make-corpus

           #:corpus-name
           #:corpus-lang
           #:corpus-raw-texts
           #:corpus-clean-texts
           #:corpus-text-tokens

           #:read-corpus
           #:read-corpus-file

           #:token
           #:token-word
           #:token-beg
           #:token-end
           #:token-tag

           #:+brown-corpus+
           #:+nps-chat-corpus+
           ))

(cl:defpackage #:nlp.test-util
  (:nicknames #:ntest)
  (:use #:common-lisp #:rutil #:nlp.util #:nlp.corpora)
  (:export ))

(cl:defpackage #:nlp.core
  (:nicknames #:ncore)
  (:use #:common-lisp #:rutil #:nlp.util)
  (:export #:ngrams
           #:ngrams-eq
           #:ngrams-order
           #:ngrams-count
           #:ngrams-total-freq
           #:ngrams-max-freq
           #:ngrams-min-freq
           #:ngrams-pairs
           #:vocab
           #:freq
           #:prob
           #:logprob
           #:cond-prob
           #:cond-logprob
           #:freqs
           #:probs
           #:logprobs
           #:cond-probs
           #:cond-logprobs
           #:top-ngram
           #:hapaxes
           #:table-ngrams
           #:ngrams-table

           #:language-model
           #:lm-order
           #:lm-ngrams
           #:make-lm
           #:perplexity
           #:stupid-backoff-lm
           #:lm-backoff

           #:index-ngrams
           #:index-context-freqs
           #:index-prefix-transition-freqs
           #:index-word-transition-freqs
           #:normalize-freqs

           #:tokenize
           ;; #:stream-tokenize
           #:tokenizer
           #:regex-word-tokenizer
           #:baseline-sentence-tokenizer
           #:<word-tokenizer>
           #:<word-chunker>
           #:<sentence-tokenizer>

           #:doublenewline-paragraph-splitter
           #:<paragraph-splitter>

           #:find-collocations
           ))

;; (cl:defpackage #:nlp.phonetics
;;   (:nicknames #:npho)
;;   (:use #:common-lisp #:rutil)
;;   (:export #:phonetic-transform
;;            ))

;; (cl:defpackage #:nlp.syntax
;;   (:nicknames #:nsyn)
;;   (:use #:common-lisp #:rutil)
;;   (:export #:pos-tag
;;            #:parse
;;            #:parse-deps
;;            ))

(cl:defpackage #:nlp.generation
  (:nicknames #:ngen)
  (:use #:common-lisp #:rutil #:nlp.util #:nlp.core)
  (:export #:generate-text

           #:text-generator

           #:markov-chain-generator
           #:mark-v-shaney-generator
           #:<mark-v-shaney>
           #:markov-order
           ))

;; (cl:defpackage #:nlp.learning
;;   (:nicknames #:nlearn)
;;   (:use #:common-lisp #:rutil)
;;   (:export #:cluster
;;            #:classify
;;            #:train
;;            ))


(cl:defpackage #:nlp-user
  (:nicknames #:nlp)
  (:use #:common-lisp #:rutil
        #:nlp.util #:nlp.corpora #:nlp.core #:nlp.generation))

(re-export-symbols '#:nutil '#:nlp-user)
(re-export-symbols '#:ncore '#:nlp-user)
(re-export-symbols '#:ncorp '#:nlp-user)
(re-export-symbols '#:ngen '#:nlp-user)

--------------------------------------------------------------------------------
/src/generation/markov-chain.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.generation)
(named-readtables:in-readtable rutils-readtable)


(defgeneric generate-text (generator data length &key)
  (:documentation
   "Generate random text of LENGTH words based on some DATA
    (usually, a table of transition probabilities between tokens).
    May not return a period at the end."))


(defclass text-generator ()
  ()
  (:documentation
   "Base class for text generators."))

(defclass markov-chain-generator (text-generator)
  ((order :accessor markov-order :initarg :order))
  (:documentation
   "Markov chain generator of the given ORDER."))

(defclass mark-v-shaney-generator (markov-chain-generator)
  ((order :reader markov-order :initform 2))
  (:documentation
   "Markov chain generator of the 2nd order. It is defined separately, because:
    - this is the general and most useful case
    - it allows using optimized data-structures and a simpler algorithm
    - the name is well-known"))

(defmethod generate-text ((generator markov-chain-generator)
                          (transitions hash-table)
                          length &key skip-paragraphs &allow-other-keys)
  "Generate text of LENGTH words with a markov model of MARKOV-ORDER described
   by the table TRANSITIONS of transition probabilities between reversed
   prefixes of MARKOV-ORDER length and words.
   Unless SKIP-PARAGRAPHS is set, the text may include newlines."
  (let* ((order (markov-order generator))
         (initial-prefix (if (> order 1)
                             (cons "¶" (make-list (1- order)))
                             (list "¶")))
         (prefix initial-prefix)
         rez)
    (loop :for i :from 1 :to length :do
       (let ((r (random 1.0)))
         (dotable (word prob
                   (or (get# prefix transitions)
                       ;; no continuation - start anew
                       (prog1 (get# (setf prefix initial-prefix) transitions)
                         ;; add a period unless one is already there
                         (unless (every #'period-char-p (car rez))
                           (push "." rez)
                           (incf i)))))
           (when (<= (decf r prob) 0)
             (if (string= "¶" word)
                 (if skip-paragraphs
                     (decf i)  ; don't count the newline if we don't include it
                     (push +newline+ rez))
                 (push word rez))
             (setf prefix (cons word (butlast prefix)))
             (return)))))
    (reverse rez)))

(defmethod generate-text ((generator markov-chain-generator)
                          (model language-model)
                          length &key &allow-other-keys)
  "Generate text of LENGTH words with a markov model of MARKOV-ORDER
   with the given language MODEL.
   May not return a period at the end."
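  ;; A sketch of what happens below: at each step we walk the vocabulary,
  ;; summing the conditional probabilities of each word continuing the
  ;; current NGRAM, then draw a uniform random number in [0, total) and
  ;; binary-search for the word whose cumulative-probability interval it
  ;; falls into (roulette-wheel sampling); drawing "</S>" ends the sentence
  ;; with a period and restarts generation from the "<S>" marker.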
  (assert (<= (markov-order generator) (lm-order model)))
  (let* ((order (markov-order generator))
         (vocab (vocab model))
         (len (length vocab))
         (ngram (list "<S>"))
         rez)
    (loop :for i :from 1 :to length :do
       (when (= (length ngram) order)
         (setf ngram (rest ngram)))
       (let ((total 0)
             (cond-probs (list (cons "</S>" 0))))
         (dolist (word vocab)
           (unless (string= "<S>" word)
             (push (cons word
                         (incf total (cond-prob model (append ngram
                                                              (list word)))))
                   cond-probs)))
         (let ((word (car (bin-search (random total)
                                      (make-array len
                                                  :initial-contents cond-probs)
                                      #'> :key #'cdr))))
           (if (string= "</S>" word)
               (progn (if (every #'period-char-p (car rez))
                          (unless (= i length)
                            (decf i))  ; just skip
                          (push "." rez))
                      (setf ngram (list "<S>")))
               (setf rez (cons word rez)
                     ngram (append ngram (list word)))))))
    (reverse rez)))


(define-lazy-singleton mark-v-shaney (make 'markov-chain-generator :order 2)
  "The infamous Mark V. Shaney.")

--------------------------------------------------------------------------------
/src/util.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.util)
(named-readtables:in-readtable rutils-readtable)


;;; Some package renaming

(rename-package "CL-PPCRE" "CL-PPCRE" '("PPCRE" "RE"))


;;; Conditions

(define-condition nlp-error (simple-error) ())

(define-condition not-implemented-error (simple-error) ())


;;; Working with chars

(defparameter +newline+
  (fmt "~%")
  "A string with a single newline.")

(defparameter +white-chars+
  '(#\Space #\Tab #\Newline #\Return #\Linefeed)
  "Chars considered WHITESPACE.")

(defun white-char-p (char)
  "Test if CHAR is in +WHITE-CHARS+."
  (member char +white-chars+))

(defparameter +period-chars+
  '(#\. #\? #\!)
  "Chars considered legitimate sentence endings.")

(defun period-char-p (char)
  "Test if CHAR is in +PERIOD-CHARS+."
  (member char +period-chars+))

(defparameter +newline-chars+
  '(#\Newline #\Return #\Linefeed)
  "Chars considered legitimate paragraph endings.")

(defun newline-char-p (char)
  "Test if CHAR is in +NEWLINE-CHARS+."
  (member char +newline-chars+))

(declaim (inline filler))
(defun filler (n &optional (fill-char #\Space))
  "Produce an N-element filler string of FILL-CHAR's."
  (make-string n :initial-element fill-char))


;;; Working with words

(defparameter *stopwords-en*
  '("!" "\"" "'" "," "-" "." ":" ";" "<S>" "</S>" "?"
"a" "about" "above" "after" "again" "against" "all" "am" "an" "and" "any" "are" "aren't" "as" "at" "be" "because" "been" "before" "being" "below" "between" "both" "but" "by" "can't" "cannot" "could" "couldn't" "d" "did" "didn't" "do" "does" "doesn't" "doing" "don't" "down" "during" "each" "few" "for" "from" "further" "had" "hadn't" "has" "hasn't" "have" "haven't" "having" "he" "he'd" "he'll" "he's" "her" "here" "here's" "hers" "herself" "him" "himself" "his" "how" "how's" "i" "i'd" "i'll" "i'm" "i've" "if" "in" "into" "is" "isn't" "it" "it's" "its" "itself" "let" "let's" "ll" "me" "more" "most" "mustn't" "my" "myself" "n't" "no" "nor" "not" "of" "off" "on" "once" "only" "or" "other" "ought" "our" "ours " "ourselves" "out" "over" "own" "s" "same" "shan't" "she" "she'd" "she'll" "she's" "should" "shouldn't" "so" "some" "such" "t" "than" "that" "that's" "the" "their" "theirs" "them" "themselves" "then" "there" "there's" "these" "they" "they'd" "they'll" "they're" "they've" "this" "those" "through" "to" "too" "under" "until" "up" "very" "was" "wasn't" "we" "we'd" "we'll" "we're" "we've" "were" "weren't" "what" "what's" "when" "when's" "where" "where's" "which" "while" "who" "who's" "whom" "why" "why's" "with" "won't" "would" "wouldn't" "you" "you'd" "you'll" "you're" "you've" "your" "yours" "yourself" "yourselves") 59 | "List of english stopwords.") 60 | 61 | (defun ending-word-p (word) 62 | "Check if string WORD is some kind of a period char or a paragraph mark." 63 | (or (every #'period-char-p word) 64 | (string= "¶" word))) 65 | 66 | 67 | ;;; Working with project files 68 | 69 | (eval-always 70 | (defparameter +project-root+ (asdf:system-relative-pathname 'cl-nlp "") 71 | "Base dir of cl-nlp project.")) 72 | 73 | (defun data-file (filename) 74 | "File in data/ subdir of cl-nlp." 75 | (merge-pathnames (strcat "data/" filename) 76 | +project-root+)) 77 | 78 | (defun list-from-file (file) 79 | "Load the contents of FILE into a list of strings for each trimmed line." 80 | (let (rez) 81 | (dolines (line file) 82 | (push (string-trim +white-chars+ line) rez)) 83 | (reverse rez))) 84 | 85 | 86 | ;;; Search 87 | 88 | (defun bin-search (val vec test-less &key (start 0) end key test) 89 | "Binary search for VAL in sorted vector VEC (the order property isn't checked). 90 | Needs to specify TEST-LESS predicate. Handles START, END, KEY as usual. 91 | It TEST is provided tests the value at the found position against VAL, 92 | and returns nil if it returns nil." 93 | (let ((low start) 94 | (high (or end (1- (length vec))))) 95 | (do () 96 | ((= low high) (when (or (null test) 97 | (funcall test val (svref vec high))) 98 | (elt vec high))) 99 | (let ((mid (floor (+ low high) 2))) 100 | (if (funcall test-less (if key 101 | (funcall key (svref vec mid)) 102 | (svref vec mid)) 103 | val) 104 | (setf low (1+ mid)) 105 | (setf high mid)))))) 106 | 107 | 108 | ;;; Misc 109 | 110 | (defmacro define-lazy-singleton (name init &optional docstring) 111 | "Define a function NAME, that will return a singleton object, 112 | initialized lazily with INIT on first call. 113 | Also define a symbol macro that will expand to (NAME)." 114 | (with-gensyms (singleton) 115 | `(let (,singleton) 116 | (defun ,name () 117 | ,docstring 118 | (or ,singleton 119 | (setf ,singleton ,init))) 120 | (define-symbol-macro ,(mksym name :format "<~A>") (,name))))) 121 | 122 | (defun sorted-ht-keys (test ht) 123 | "Return hash-table keys of HT in sorted order accroding to TEST." 
  (sort (ht-keys ht) test :key #`(get# % ht)))

(defun shorter? (list n)
  "Test if LIST has fewer than N elements."
  (let ((tail list))
    (loop :repeat (1- n) :do (setf tail (cdr tail)))
    (null tail)))

--------------------------------------------------------------------------------
/src/core/language-models.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.core)
(named-readtables:in-readtable rutils-readtable)


(defclass language-model ()
  ((order :initarg :order :reader lm-order)
   (ngrams :initarg :ngrams :reader lm-ngrams))
  (:documentation
   "A language model is a collection of NGRAMS of all orders from 1 up to ORDER."))

(defmethod vocab ((model language-model) &key order-by)
  (vocab (get-ngrams 1 model) :order-by order-by))

(defgeneric make-lm (class &key 1g 2g 3g 4g 5g &allow-other-keys)
  (:documentation
   "Make an instance of a language model of a certain CLASS
    with the provided unigrams (1G), ... up to fivegrams (5G)."))

(defmethod make-lm (class &key 1g 2g 3g 4g 5g &allow-other-keys)
  (let ((order (cond (5g 5)
                     (4g 4)
                     (3g 3)
                     (2g 2)
                     (1g 1)
                     (t (error "No ngrams supplied")))))
    (make class
          :order order
          :ngrams
          (make-array
           (1+ order)
           :initial-contents
           (case order
             (1 (list nil
                      1g))
             (2 (list nil
                      (or 1g (error "No unigrams supplied for LM of order 2"))
                      2g))
             (3 (list nil
                      (or 1g (error "No unigrams supplied for LM of order 3"))
                      (or 2g (error "No bigrams supplied for LM of order 3"))
                      3g))
             (4 (list nil
                      (or 1g (error "No unigrams supplied for LM of order 4"))
                      (or 2g (error "No bigrams supplied for LM of order 4"))
                      (or 3g (error "No trigrams supplied for LM of order 4"))
                      4g))
             (5 (list nil
                      (or 1g (error "No unigrams supplied for LM of order 5"))
                      (or 2g (error "No bigrams supplied for LM of order 5"))
                      (or 3g (error "No trigrams supplied for LM of order 5"))
                      (or 4g (error "No fourgrams supplied for LM of order 5"))
                      5g)))))))

(defgeneric perplexity (model test-sentences)
  (:documentation
   "Calculate perplexity of the MODEL on the list of TEST-SENTENCES."))

(defmethod perplexity ((model language-model) test-sentences)
  (expt 2 (- (/ (reduce #'+ (mapcar #`(logprob model %) test-sentences))
                (reduce #'+ (mapcar #'length test-sentences))))))


;;; Stupid Backoff LM

(defclass stupid-backoff-lm (language-model)
  ((backoff :initarg :backoff :initform 0.4 :reader lm-backoff))
  (:documentation
   "Stupid Backoff language model."))

(defmethod prob ((lm language-model) (sentence string))
  (prob lm (tokenize <word-tokenizer> sentence)))

(defmethod prob ((lm language-model) (sentence list))
  (expt 2 (logprob lm sentence)))

(defmethod logprob ((lm language-model) (sentence string))
  (logprob lm (tokenize <word-tokenizer> sentence)))

(defmethod logprob ((model language-model) (sentence list))
  (unless sentence (return-from logprob nil))
  (with-slots (order) model
    (let ((rez 0))
      (if (= 1 order)
          (dolist (word sentence rez)
            (incf rez (logprob (get-ngrams 1 model) word)))
          (let ((s (append (cons "<S>" sentence) (list "</S>"))))
            (if (shorter? s order)
                (logprob (get-ngrams (length s) model) s)
                (progn
                  (do ((i 2 (1+ i)))
                      ((= i order))
                    (incf rez (cond-logprob model (sub s 0 i))))
                  (do ((tail s (rest tail)))
                      ((shorter? tail order))
                    (let ((ngram (sub tail 0 order)))
                      (unless (search '("</S>" "<S>") ngram :test 'equal)
                        (incf rez (cond-logprob model ngram)))))
                  rez)))))))

(defmethod cond-prob ((model stupid-backoff-lm) ngram)
  (with-accessors ((ngrams lm-ngrams) (backoff lm-backoff)) model
    (let ((coef 1)
          (len (length ngram)))
      (loop :for i :from len :downto 1 :do
         (let* ((cur (butlast ngram (- len i)))
                (freq (freq (elt ngrams i)
                            (if (cdr cur) cur (car cur)))))
           (if (zerop freq)
               (setf coef (* coef backoff))
               (return-from cond-prob
                 (* coef (/ freq
                            (case i
                              (1 (ngrams-total-freq (elt ngrams 1)))
                              (2 (freq (elt ngrams 1) (car ngram)))
                              (otherwise
                               (freq (elt ngrams (1- i)) (butlast ngram))))))))))
      (* coef (/ (ngrams-min-freq (elt ngrams 1))
                 (ngrams-total-freq (elt ngrams 1)))))))

(defmethod cond-logprob ((model stupid-backoff-lm) ngram)
  (log2 (cond-prob model ngram)))


;;; Helper functions

(declaim (inline get-ngrams))
(defun get-ngrams (order model)
  "Get ngrams of a given ORDER from MODEL."
  (assert (<= order (lm-order model)))
  (elt (lm-ngrams model) order))

--------------------------------------------------------------------------------
/src/core/indexing.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.core)
(named-readtables:in-readtable rutils-readtable)
(declaim (optimize (compilation-speed 2) (speed 3) (space 2) (debug 1)))


(defun index-ngrams (order words &key ignore-case)
  "Make an ngrams-table of ORDER from a list of WORDS.
   May IGNORE-CASE."
  (make 'table-ngrams :order order
        :table
        (let ((ht (make-hash-table :test (if ignore-case 'equalp 'equal)
                                   :rehash-size 10.0))
              (last-idx (1- order)))
          (do ((tail words (rest tail)))
              ((shorter? tail order))
            (let ((cur (car tail)))
              (if (= 1 order)
                  (progn
                    (when (stringp cur)
                      (cond ((string= "¶" cur)
                             (setf cur "<S>"))
                            ((and (every #'period-char-p cur)
                                  (or (null (rest tail))
                                      (upper-case-p (char (second tail) 0))))
                             (setf tail (cons nil (cons "<S>" (rest tail)))))))
                    (incf (get# cur ht 0)))
                  (incf (get# (let* ((ngram (sub tail 0 order))
                                     (suffix (rest ngram)))
                                (cond
                                  ;; paragraph start is sentence start
                                  ((string= "¶" cur)
                                   (cons "<S>" suffix))
                                  ;; paragraph end
                                  ((string= "¶" (nth last-idx ngram))
                                   (setf tail (nthcdr (1- last-idx) tail))
                                   (append (butlast ngram) (list "</S>")))
                                  ;; sentence end
                                  ((and (upper-case-p
                                         (char (nth last-idx ngram) 0))
                                        (every #'period-char-p
                                               (nth (1- last-idx) ngram)))
                                   (setf tail (append (list nil "<S>")
                                                      (nthcdr last-idx tail)))
                                   (append (butlast ngram) (list "</S>")))
                                  ;; inside sentence
                                  (t ngram)))
                              ht 0)))))
          (when (= order 1)
            (when (get# "<S>" ht)
              (set# "<S>" ht (decf (get# "<S>" ht)))))
          ht)))

(defun index-context-freqs (words &key ignore-order)
  "Create a table of weighted conditional frequencies
   of 1-word contexts to each side of a word
   (if IGNORE-ORDER, both left_right and right_left
   are normalized and treated as the same context)
   for each distinct word in WORDS."
  ;; TODO: generalize for broader contexts
  (let ((ctxs (make-hash-table :test 'equal)))
    (loop :for (prev cur next) :on (cons "<S>" (append words (list "</S>")))
          :while next :do
       (unless (get# cur ctxs)
         (set# cur ctxs (make-hash-table :test 'equal)))
       (when (and (upper-case-p (char cur 0))
                  (ending-word-p prev))
         (setf prev "<S>"))
       (when (and (upper-case-p (char next 0))
                  (ending-word-p cur))
         (setf next "</S>"))
       (let ((prev_next (if (and ignore-order (string< next prev))
                            (strcat next "_" prev)
                            (strcat prev "_" next))))
         (set# prev_next (get# cur ctxs)
               (1+ (get# prev_next (get# cur ctxs) 0)))))
    (normalize-freqs ctxs)))

(defun index-prefix-transition-freqs (words &key (n 1))
  "Create a table of weighted conditional frequencies
   of next words for each distinct reversed N-word sequence in WORDS."
  (let ((transitions (make-hash-table :test 'equalp))
        (limit (length words))
        (count 0))
    ;; traversing the list of words from the end
    (loop :for tail :on (reverse (append (make-list n) words))
          :while (< count limit) :do
       (incf count)
       (let* ((word (car tail))
              (prefix (sub tail 1 (1+ n))))
         (when (and (> n 1) (string= "¶" (car prefix)))
           (setf prefix (cons "¶" (make-list (1- n)))))
         (unless (get# prefix transitions)
           (set# prefix transitions (make-hash-table :test 'equal)))
         (set# word (get# prefix transitions)
               (1+ (get# word (get# prefix transitions) 0)))))
    (normalize-freqs transitions)))

(defun index-word-transition-freqs (words)
  "Create a table of weighted conditional frequencies
   of next words for each distinct word in WORDS."
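  ;; The resulting table maps each word to a nested table of successor
  ;; frequencies normalized into probabilities, e.g. (made-up numbers):
  ;;   "the" -> {"cat" 2/5, "dog" 3/5}
  ;; with "¶" prepended as the transition source for the first word.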
  (let ((transitions (make-hash-table :test 'equalp))
        (word-vec (make-array (1+ (length words))
                              :initial-contents (cons "¶" words))))
    (dotimes (i (length words))
      (let ((prev (elt word-vec i))
            (cur (elt word-vec (+ i 1))))
        (unless (get# prev transitions)
          (set# prev transitions (make-hash-table :test 'equal)))
        (set# cur (get# prev transitions)
              (1+ (get# cur (get# prev transitions) 0)))))
    (normalize-freqs transitions)))


;;; Collocations

(defun find-collocations (bigrams &key (n 20))
  "Find up to N strongest collocations in BIGRAMS."
  (let ((rez (make-hash-table :test 'equal))
        (left (make-hash-table :test 'equal))
        (right (make-hash-table :test 'equal))
        (total (ngrams-total-freq bigrams)))
    (dotable (ngram freq (ngrams-table bigrams))
      (ds-bind (l r) ngram
        (set# l left (+ freq (get# l left 0)))
        (set# r right (+ freq (get# r right 0)))))
    (dotable (ngram freq (ngrams-table bigrams))
      (unless (reduce #'or2
                      (mapcar #`(member % *stopwords-en* :test 'string-equal)
                              ngram))
        (let ((lfreq (- (get# (car ngram) left) freq))
              (rfreq (- (get# (cadr ngram) right) freq)))
          (set# ngram rez
                (log-likelihood-ratio freq lfreq
                                      rfreq (- total lfreq rfreq freq))))))
    (take n (sorted-ht-keys '> rez))))


;;; Helpers

(defun normalize-freqs (ht-of-hts)
  "For each table in HT-OF-HTS normalize all the values.
   Returns the modified HT-OF-HTS."
  (maphash #`(let ((total (reduce '+ (ht-vals %%))))
               (dotable (k v %%)
                 (set# k %% (/ v total))))
           ht-of-hts)
  ht-of-hts)

--------------------------------------------------------------------------------
/nltk/ch1-1.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nltk)
(named-readtables:in-readtable rutils-readtable)


;;; Texts we'll be working with

(defclass text ()
  ((name :initarg :name)
   (raw :initarg :raw :accessor text-raw)
   (words :accessor text-words)
   (ctxs :accessor text-ctxs)
   (transitions :accessor text-transitions)
   (dispersion :accessor text-dispersion)))

(defmethod print-object ((text text) stream)
  (with-slots (name raw) text
    (let ((len (length raw)))
      (format stream "#<~A ~A... ~A>"
              name (subseq raw 0 (min len 20)) len))))
~A>" 21 | name (subseq raw 0 (min len 20)) len)))) 22 | 23 | (defmethod slot-unbound (class (obj text) (slot (eql 'words))) 24 | (with-slots (raw words) obj 25 | (format t "~&Tokenizing text...~%") 26 | (prog1 (setf words (mapcan #`(cons "¶" (tokenize %)) 27 | (tokenize raw))) 28 | (format t "Number of words: ~A~%" (length words))))) 29 | 30 | (defmethod slot-unbound (class (obj text) (slot (eql 'ctxs))) 31 | (with-slots (words ctxs) obj 32 | (format t "~&Building word contexts...~%") 33 | (prog1 (setf ctxs (index-context-freqs words)) 34 | (format t "Number of unique words: ~A~%" (length (ht-keys ctxs)))))) 35 | 36 | (defmethod slot-unbound (class (obj text) (slot (eql 'transitions))) 37 | (rebuild-transitions obj 2)) 38 | 39 | (defmethod slot-unbound (class (obj text) (slot (eql 'dispersion))) 40 | (with-slots (words dispersion) obj 41 | (format t "~&Building dispersion table...~%") 42 | (let ((ht (make-hash-table :test 'equal))) 43 | (doindex (idx word words) 44 | (set# word ht (cons idx (get# word ht)))) 45 | (prog1 (setf dispersion ht) 46 | (format t "Number of words: ~A~%" (length (ht-keys dispersion))))))) 47 | 48 | (defun nltk-text-file (name) 49 | (merge-pathnames (fmt "nltk/data/~(~A~).txt" name) 50 | nutil:+project-root+)) 51 | 52 | (defvar *texts* (make-hash-table)) 53 | 54 | (defun load-nltk-texts (&optional (dir "data/")) 55 | "Load and cache texts, that we'll use in the examples." 56 | (dolist (text '(:moby :sense :genesis :inaugural :nps-chat)) 57 | (set# text *texts* 58 | (make 'text :name text 59 | :raw (string-trim +white-chars+ 60 | (read-file (nltk-text-file text)))))) 61 | (maphash #`(print %%) *texts*)) 62 | 63 | 64 | ;;; Main functions 65 | 66 | (defun concordance (text word &key (width 25) pass-newlines) 67 | "Print contexts (up to WIDTH chars) of WORD usage in TEXT. 68 | If PASS-NEWLINES isn't set, the context will be shown 69 | up to the closest newline." 70 | (print-word-in-contexts word (text-raw text) 71 | :width 30 :pass-newlines pass-newlines)) 72 | 73 | (defun similar (text word &key (n 20)) 74 | "Find N most similar words to WORD in TEXT." 75 | (let* ((ctxs (text-ctxs text)) 76 | (ctx (get# word ctxs)) 77 | (rez (make-hash-table :test 'equal))) 78 | (maphash #`(let ((common (match-ctxs ctx %%))) 79 | (unless (or (string= word %) 80 | (zerop (hash-table-count common))) 81 | (set# % rez (reduce #'+ (ht-vals common))))) 82 | ctxs) 83 | (take n (sorted-ht-keys '> rez)))) 84 | 85 | (defun common-contexts (text &rest words) 86 | "Find matching contexts between WORDS in TEXT." 87 | (with-slots (ctxs) text 88 | (when-it (reduce #`(match-ctxs (get# %% ctxs) %) 89 | (rest words) 90 | :initial-value (get# (car words) ctxs)) 91 | (sorted-ht-keys '> it)))) 92 | 93 | (defun generate (text &key (n 20) (order 2)) 94 | "Generate random text of N words, based on TEXT." 
  (with-slots (transitions) text
    (string-trim (append +white-chars+ +newline-chars+)
                 (fmt "~{~A ~}"
                      (generate-text (make 'markov-chain-generator :order order)
                                     (if (= order
                                            (length (car (ht-keys transitions))))
                                         transitions
                                         (rebuild-transitions text order))
                                     n)))))

(defun rebuild-transitions (text n)
  "Rebuild the transitions index for prefixes of length N."
  (with-slots (words transitions) text
    (format t "~&Building prefix transitions index of length ~A...~%" n)
    (prog1 (setf transitions (index-prefix-transition-freqs words :n n))
      (format t "Number of prefixes: ~A~%" (length (ht-keys transitions))))))


(defun dispersion-plot (text &rest words)
  "Plot dispersion of WORDS in TEXT."
  (plot-dispersion words (dump-data-for-plot (reverse words)
                                             (text-dispersion text))))

(defun lexical-diversity (text)
  "Calculate the lexical diversity measure of TEXT."
  (with-slots (words) text
    (/ (float (length words))
       (hash-table-count (uniq words :raw t)))))

(defun percentage (count total)
  "Return the percentage of COUNT / TOTAL."
  (/ (* 100.0 count) total))


;;; Helper functions

(defun uniq (list &key raw case-insensitive)
  "Return only unique elements from LIST either as a new list
   or as a hash-table if RAW is set. Can be CASE-INSENSITIVE."
  (let ((uniqs (make-hash-table :test (if case-insensitive 'equalp 'equal))))
    (dolist (elt list)
      (set# elt uniqs t))
    (if raw uniqs (ht-keys uniqs))))

(defun match-ctxs (word1-ctx word2-ctx)
  "Find the intersection between WORD1-CTX and WORD2-CTX tables
   and for each common context calculate the commonality weight."
  (let ((rez (make-hash-table :test 'equal)))
    (when (and word1-ctx word2-ctx)
      (dolist (k (intersection (ht-keys word1-ctx) (ht-keys word2-ctx)
                               :test 'string=))
        (set# k rez (* (get# k word1-ctx)
                       (get# k word2-ctx)))))
    rez))

(defun dump-data-for-plot (words dispersion-table)
  "Dump data from DISPERSION-TABLE for WORDS into a temporary file
   and return its name."
  (let ((filename (fmt "/tmp/~A" (gensym))))
    (with-out-file (out filename)
      (format out "0~t0~t~%")
      (doindex (i word words)
        (dolist (idx (get# word dispersion-table))
          (format out "~A~t~A~t~A~%" idx (1+ i) word)))
      (format out "0~t~A~t~%" (1+ (length words))))
    filename))

(defun plot-dispersion (words file)
  "Plot WORDS dispersion data from FILE."
  (cgn:start-gnuplot)
  (cgn:format-gnuplot "set title \"Lexical Dispersion Plot\"")
  (cgn:format-gnuplot "plot \"~A\" using 1:2:yticlabels(3) title \"\"" file))


;;; Brown corpus information

#+manually (
(format t " Genre               | Tokens   | Types   | Lexical Diversity")
(format t "~%--------------------+----------+---------+--------------------")
(maphash #`(let* ((tokens (corpus-text-tokens %%))
                  (words (mapcar #'ncorp::token-word (flatten tokens)))
                  (words-count (length words))
                  (types-count (hash-table-count (uniq words :raw t))))
             (format t "~&~20A| ~7A | ~6A | ~6,1F~%"
                     (subseq (symbol-name %)
                             0 (min 19 (length (symbol-name %))))
                     words-count
                     types-count
                     (/ (* 1.0 words-count) types-count)))
         +brown-corpus+)
)

--------------------------------------------------------------------------------
/src/core/tokenization.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.core)
(named-readtables:in-readtable rutils-readtable)


(defgeneric tokenize (tokenizer string)
  (:documentation
   "Tokenize STRING with TOKENIZER. Outputs 2 values:

    - list of words
    - list of spans as beg-end cons pairs
   "))

;; (defgeneric stream-tokenize (tokenizer input output &optional span-output)
;;   (:documentation
;;    "Tokenize INPUT with TOKENIZER and write to OUTPUT one word per line.
;;     Also can write to SPAN-OUTPUT beg-end cons pairs related to words.
;;
;;     Usage example:
;;
;;     (let ((pipe (make-two-way-stream
;;                  (make-input-string-stream) (make-output-string-stream))))
;;       (bt:make-thread #`(stream-tokenize *standard-input* pipe))
;;       (loop :for line := (read-line pipe nil) :while line :do
;;          (print line)))
;;   "))


(defclass tokenizer ()
  ()
  (:documentation
   "Base class for tokenizers."))

(defmethod tokenize :around ((tokenizer tokenizer) string)
  "Pre-split text into lines and tokenize each line separately."
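  ;; Each line is handed to the next most specific TOKENIZE method; the
  ;; returned spans are shifted by the running OFFSET so that they index
  ;; into the original multi-line STRING (the 1+ below accounts for the
  ;; newline character eaten by the split).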
  (let ((offset 0)
        words spans)
    (loop :for line :in (split-sequence #\Newline string) :do
       (mv-bind (ts ss) (call-next-method tokenizer line)
         (setf words (nconc words ts)
               spans (nconc spans (mapcar #`(cons (+ (car %) offset)
                                                  (+ (cdr %) offset))
                                          ss)))
         (incf offset (1+ (length line)))))
    (values words
            spans)))


;;; Word tokenization

(defclass regex-word-tokenizer (tokenizer)
  ((regex :accessor tokenizer-regex :initarg :regex
          :initform
          (re:create-scanner
           "(\\w+|[!\"#$%&'*+,./:;<=>?@^`~…\\(\\)⟨⟩{}\\[\\|\\]‒–—―«»“”‘’¶-])")
          :documentation
          "A simpler variant would be [^\\s]+
           — it doesn't split punctuation, yet sometimes it's desirable."))
  (:documentation
   "Regex-based word tokenizer."))

(defmethod tokenize ((tokenizer regex-word-tokenizer) string)
  (loop :for (beg end) :on (re:all-matches (tokenizer-regex tokenizer) string)
        :by #'cddr
        :collect (subseq string beg end) :into words
        :collect (cons beg end) :into spans
        :finally (return (values words
                                 spans))))

(define-lazy-singleton word-tokenizer (make 'regex-word-tokenizer)
  "Basic word tokenizer.")

(define-lazy-singleton word-chunker
    (make 'regex-word-tokenizer :regex (re:create-scanner "[^\\s]+"))
  "Dumb word tokenizer, that will not split punctuation from words.")


(defclass treebank-word-tokenizer (tokenizer)
  ()
  (:documentation
   "The Treebank tokenizer uses regular expressions to tokenize text
    as in Penn Treebank. It's a port of Robert MacIntyre's tokenizer
    (see: <http://www.cis.upenn.edu/~treebank/tokenizer.sed>).
    It assumes that the text has already been split into sentences.

    This tokenizer performs the following steps:

    - split standard contractions: don't -> do n't, they'll -> they 'll
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line
   "))

;; (let ((contractions-regex
;;        (re:create-scanner (s+ "("
;;                               "([^' ])('[sS]|'[mM]|'[dD]|') "
;;                               "|"
;;                               "([^' ])('ll|'re|'ve|n't|) "
;;                               "|"
;;                               "\\b("
;;                               "(can)(not)"
;;                               "|(d)('ye)"
;;                               "|(gim)(me)"
;;                               "|(gon)(na)"
;;                               "|(got)(ta)"
;;                               "|(lem)(me)"
;;                               "|(wan)(na)"
;;                               "|(mor)('m)"
;;                               ")\\b"
;;                               "|"
;;                               " ('t)(is|was)\\b"
;;                               ")"
;;                           :case-insensitive-mode t))
;;       (contractions3-regex (re:create-scanner
;;                             :case-insensitive-mode t)))
;;   (defmethod tokenize ((tokenizer treebank-word-tokenizer) string)
;;     (re-setf string
;;              ;; starting quotes
;;              (re:regex-replace-all "^\"" "``")
;;              (re:regex-replace-all "(``)" " \\1 ")
;;              (re:regex-replace-all "([ (\[{<])\"" "\\1 `` ")
;;              (re:regex-replace-all "^\"" "``")
;;              (re:regex-replace-all "^\"" "``")
;;              ;; punctuation
;;              (re:regex-replace-all "([:,])([^\\d])" " \\1 \\2 ")
;;              (re:regex-replace-all "\\.\\.\\." " ... ")
") 128 | ;; (re:regex-replace-all "[;@#$%&?!]" " \\0 ") 129 | ;; (re:regex-replace-all "([^\\.])(\\.)([\]\)}>\"']*)\\s*$" "\\1 \\2\\3 ") 130 | ;; (re:regex-replace-all "--" " \\0 ") 131 | ;; (re:regex-replace-all "[\\]\\[\\(\\)\\{\\}\\<\\>]" " \\0 ") 132 | ;; (strcat " " " ") 133 | ;; ;; quotes 134 | ;; (re:regex-replace-all "([^'])(') " "\\1 \\2 ") 135 | ;; (re:regex-replace-all "\"" " '' ") 136 | ;; (re:regex-replace-all "(\\S)(\\'\\')" "\\1 \\2 ") 137 | 138 | ;; (re:regex-replace-all contractions-regex "\\1 \\2 ") 139 | ;; (re:regex-replace-all " +" " ") 140 | ;; (string-trim " ")) 141 | ;; (unless (blankp string) 142 | ;; (setf text (strcat string " "))) 143 | ;; (tokenize string)) 144 | 145 | ;;; Sentence splitting 146 | 147 | (defclass baseline-sentence-tokenizer (tokenizer) 148 | () 149 | (:documentation 150 | "Basic tokenizer for sentence splitting.")) 151 | 152 | (defparameter +abbrevs-with-dot+ 153 | (list-from-file (data-file "abbrevs-with-dot.txt")) 154 | "Widely-used abbreviations ending in dot.") 155 | 156 | (defmethod tokenize ((tokenizer baseline-sentence-tokenizer) string) 157 | (mv-bind (words word-spans) 158 | (tokenize (make 'regex-word-tokenizer :regex "[^\\s]+") 159 | (substitue #\¶ #\Newline string)) 160 | (let ((beg 0) 161 | sentences spans) 162 | (loop :for ws :on words :and ss :on word-spans :do 163 | (let ((word (first ts)) 164 | (span (first ss))) 165 | (when (or (null (rest ts)) 166 | (and (member (char word (1- (length word))) 167 | '(#\. #\? #\! #\¶)) 168 | (not (member word +abbrevs-with-dot+ 169 | :test #'string-equal)) 170 | (and-it (second ts) 171 | (upper-case-p (char it 0))))) 172 | (push (subseq string beg (cdr span)) sentences) 173 | (push (cons beg (cdr span)) spans) 174 | (setf beg (car (second ss)))))) 175 | (values (reverse sentences) 176 | (reverse spans))))) 177 | 178 | (define-lazy-singleton sentence-tokenizer (make 'baseline-sentence-tokenizer) 179 | "Basic sentence splitter.") 180 | 181 | 182 | ;;; Paragraph splitting 183 | 184 | (defclass doublenewline-paragraph-splitter () 185 | () 186 | (:documentation 187 | "Paragraph tokenizer that splits text on double newlines 188 | and removes single newlines.")) 189 | 190 | (defmethod tokenize ((tokenizer doublenewline-paragraph-splitter) string) 191 | (let ((newline-regex (re:create-scanner 192 | (fmt "(~C|[~C~C]{1,2})" #\Newline #\Return #\Linefeed)))) 193 | (mapcar #`(fmt "~{~A ~}" %) 194 | (split-sequence-if #'blankp 195 | (re:split newline-regex string) 196 | :remove-empty-subseqs t)))) 197 | 198 | (define-lazy-singleton paragraph-splitter 199 | (make 'doublenewline-paragraph-splitter) 200 | "Basic paragraph splitter.") 201 | 202 | 203 | ;;; Helpers 204 | 205 | (defmacro re-setf (var &body body) 206 | "For each clause in BODY wrap it in `(setf var (clause arg1 var args))`." 
203 | ;;; Helpers
204 | 
205 | (defmacro re-setf (var &body body)
206 |   "For each clause in BODY wrap it in `(setf var (clause arg1 var args))`."
207 |   `(progn
208 |      ,@(mapcar (lambda (clause)
209 |                  `(setf ,var (,(car clause)
210 |                               ,@(when-it (cadr clause) (list it))
211 |                               ,var
212 |                               ,@(when-it (cadr clause) (nthcdr 2 clause)))))
213 |                body)))
214 | 
--------------------------------------------------------------------------------
/src/core/ngrams.lisp:
--------------------------------------------------------------------------------
1 | ;;; (c) 2013 Vsevolod Dyomkin
2 | 
3 | (in-package #:nlp.core)
4 | (named-readtables:in-readtable rutils-readtable)
5 | 
6 | 
7 | ;;; Abstract Ngrams
8 | 
9 | (defclass ngrams ()
10 |   ((order :initarg :order :reader ngrams-order)
11 |    (count :reader ngrams-count)
12 |    (max-freq :reader ngrams-max-freq)
13 |    (min-freq :reader ngrams-min-freq)
14 |    (total-freq :reader ngrams-total-freq))
15 |   (:documentation
16 |    "An abstract ngrams interface."))
17 | 
18 | (defmethod print-object ((ngrams ngrams) stream)
19 |   (print-unreadable-object (ngrams stream :type t :identity t)
20 |     (if (slot-boundp ngrams 'order)
21 |         (with-accessors ((order ngrams-order) (count ngrams-count)
22 |                          (total-freq ngrams-total-freq)) ngrams
23 |           (format stream "order:~A count:~A outcomes:~A"
24 |                   order count total-freq))
25 |         (format stream "not initialized"))))
26 | 
27 | (defgeneric ngrams-eq (ngrams)
28 |   (:documentation
29 |    "Get the equality predicate of NGRAMS (can be EQUAL or EQUALP)."))
30 | 
31 | (defgeneric ngrams-pairs (ngrams &key order-by)
32 |   (:documentation
33 |    "Get the alist of all ngrams with their frequencies in NGRAMS,
34 |     possibly ordered by ORDER-BY predicate (e.g. < or >)."))
35 | 
36 | (defgeneric vocab (ngrams &key order-by)
37 |   (:documentation
38 |    "Get the list of all ngrams in NGRAMS,
39 |     possibly ordered by ORDER-BY predicate (e.g. < or >)."))
40 | 
41 | (defgeneric freq (ngrams ngram)
42 |   (:documentation
43 |    "Get the NGRAM frequency in NGRAMS.")
44 |   (:method :around ((ngrams ngrams) (ngram string))
45 |     (if (> (ngrams-order ngrams) 1)
46 |         (freq ngrams (tokenize-ngram ngrams ngram))
47 |         (call-next-method))))
48 | 
49 | (defgeneric prob (ngrams ngram)
50 |   (:documentation
51 |    "Get the NGRAM probability in NGRAMS.")
52 |   (:method ((ngrams ngrams) ngram)
53 |     (/ (freq ngrams ngram)
54 |        (ngrams-total-freq ngrams)))
55 |   (:method :around ((ngrams ngrams) (ngram string))
56 |     (if (> (ngrams-order ngrams) 1)
57 |         (prob ngrams (tokenize-ngram ngrams ngram))
58 |         (call-next-method))))
59 | 
60 | (defgeneric logprob (ngrams ngram)
61 |   (:documentation
62 |    "Get the log (to base 2) of NGRAM probability in NGRAMS.")
63 |   (:method ((ngrams ngrams) ngram)
64 |     (let ((prob (prob ngrams ngram)))
65 |       (if (zerop prob)
66 |           nil
67 |           (* (log prob) #.(/ 1 (log 2))))))
68 |   (:method :around ((ngrams ngrams) (ngram string))
69 |     (if (> (ngrams-order ngrams) 1)
70 |         (logprob ngrams (tokenize-ngram ngrams ngram))
71 |         (call-next-method))))
72 | 
73 | (defgeneric cond-prob (ngrams ngram)
74 |   (:documentation
75 |    "Get the NGRAM conditional probability in NGRAMS.
76 |     By conditional probability we mean the probability of occurrence
77 |     of the last word given the previous words.")
78 |   (:method :around ((ngrams ngrams) ngram)
79 |     (if (= 1 (ngrams-order ngrams))
80 |         (prob ngrams ngram)  ; for unigrams this degenerates to the plain probability
81 |         (call-next-method)))
82 |   (:method :around ((ngrams ngrams) (ngram string))
83 |     (if (> (ngrams-order ngrams) 1)
84 |         (cond-prob ngrams (tokenize-ngram ngrams ngram))
85 |         (call-next-method))))
86 | 
87 | (defgeneric cond-logprob (ngrams ngram)
88 |   (:documentation
89 |    "Get the log of NGRAM conditional probability in NGRAMS.
90 |     By conditional probability we mean the probability of occurrence
91 |     of the last word given the previous words.")
92 |   (:method :around ((ngrams ngrams) (ngram string))
93 |     (if (> (ngrams-order ngrams) 1)
94 |         (cond-logprob ngrams (tokenize-ngram ngrams ngram))
95 |         (call-next-method))))
96 | 
97 | (defgeneric freqs (ngrams &rest ngrams-list)
98 |   (:documentation
99 |    "Get the list of frequencies of ngrams from NGRAMS-LIST in NGRAMS.")
100 |   (:method (ngrams &rest ngrams-list)
101 |     (mapcar #`(freq ngrams %) ngrams-list)))
102 | 
103 | (defgeneric probs (ngrams &rest ngrams-list)
104 |   (:documentation
105 |    "Get the list of probabilities of ngrams from NGRAMS-LIST in NGRAMS.")
106 |   (:method (ngrams &rest ngrams-list)
107 |     (mapcar #`(prob ngrams %) ngrams-list)))
108 | 
109 | (defgeneric logprobs (ngrams &rest ngrams-list)
110 |   (:documentation
111 |    "Get the list of log-probabilities of ngrams from NGRAMS-LIST in NGRAMS.")
112 |   (:method (ngrams &rest ngrams-list)
113 |     (mapcar #`(logprob ngrams %) ngrams-list)))
114 | 
115 | (defgeneric cond-probs (ngrams &rest ngrams-list)
116 |   (:documentation
117 |    "Get the conditional probabilities of ngrams from NGRAMS-LIST in NGRAMS.
118 |     By conditional probability we mean the probability of occurrence
119 |     of the last word given the previous words.")
120 |   (:method ((ngrams ngrams) &rest ngrams-list)
121 |     (mapcar #`(cond-prob ngrams %) ngrams-list)))
122 | 
123 | (defgeneric cond-logprobs (ngrams &rest ngrams-list)
124 |   (:documentation
125 |    "Get the conditional log-probabilities of ngrams from NGRAMS-LIST in NGRAMS.
126 |     By conditional probability we mean the probability of occurrence
127 |     of the last word given the previous words.")
128 |   (:method ((ngrams ngrams) &rest ngrams-list)
129 |     (mapcar #`(cond-logprob ngrams %) ngrams-list)))
130 | 
131 | (defgeneric top-ngram (ngrams)
132 |   (:documentation
133 |    "Get some ngram with the highest frequency in NGRAMS."))
134 | 
135 | (defgeneric hapaxes (ngrams)
136 |   (:documentation
137 |    "Get all the ngrams with the lowest frequency in NGRAMS.
138 |     Second value is the frequency itself."))
139 | 
140 | 
141 | ;;; Table-based ngrams
142 | 
143 | (defclass table-ngrams (ngrams)
144 |   ((table :initform (make-hash-table :test 'equal) :initarg :table
145 |           :reader ngrams-table))
146 |   (:documentation
147 |    "Ngrams with hash-table source."))
148 | 
149 | (defmethod initialize-instance :after ((ngrams table-ngrams) &key)
150 |   (with-slots (table order max-freq min-freq total-freq) ngrams
151 |     (check-type table hash-table)
152 |     (assert (member (hash-table-test table) '(equal equalp)))
153 |     (with-hash-table-iterator (gen-fn table)
154 |       (when-it (nth-value 2 (gen-fn))
155 |         (setf total-freq (setf max-freq (setf min-freq it)))
156 |         (loop
157 |            (mv-bind (next? _ freq) (gen-fn)
158 |              (unless next? (return))
159 |              (incf total-freq freq)
160 |              (when (< freq min-freq)
161 |                (setf min-freq freq))
162 |              (when (> freq max-freq)
163 |                (setf max-freq freq))))))))
164 | 
165 | (defmethod ngrams-count ((ngrams table-ngrams))
166 |   (hash-table-count (ngrams-table ngrams)))
167 | 
168 | (defmethod ngrams-eq ((ngrams table-ngrams))
169 |   (hash-table-test (ngrams-table ngrams)))
170 | 
171 | (defmethod vocab ((ngrams table-ngrams) &key order-by)
172 |   (with-slots (table) ngrams
173 |     (if order-by
174 |         (mapcar #'car (sort (ngrams-pairs ngrams) order-by :key #'cdr))
175 |         (ht-keys table))))
176 | 
177 | (defmethod ngrams-pairs ((ngrams table-ngrams) &key order-by)
178 |   (with-slots (table) ngrams
179 |     (if order-by
180 |         (sort (ht->alist table) order-by :key #'cdr)
181 |         (ht->alist table))))
182 | 
183 | ;; (defmethod ngrams-pairs ((ngrams table-ngrams))
184 | ;;   (let ((total 0)
185 | ;;         (freq 0)
186 | ;;         (eq-test (ngrams-eq ngrams))
187 | ;;         (prefix (butlast ngram)))
188 | ;;     (dolist (ng (vocab ngrams))
189 | ;;       (cond ((funcall eq-test ngram ng)
190 | ;;              (incf total (setf freq (freq ng))))
191 | ;;             ((funcall eq-test prefix (butlast ng))
192 | ;;              (incf total (freq ng)))))
193 | ;;     (if (zerop total) 0
194 | ;;         (/ freq total))))
195 | 
196 | (defmethod freq ((ngrams table-ngrams) ngram)
197 |   (get# ngram (ngrams-table ngrams) 0))
198 | 
199 | (defmethod freqs ((ngrams table-ngrams) &rest ngrams-list)
200 |   (mapcar #`(freq ngrams %)
201 |           ngrams-list))
202 | 
203 | (defmethod probs ((ngrams table-ngrams) &rest ngrams-list)
204 |   (mapcar #`(prob ngrams %)
205 |           ngrams-list))
206 | 
207 | (defmethod cond-prob ((ngrams table-ngrams) ngram)
208 |   (let ((total 0)
209 |         (freq 0)
210 |         (eq-test (ngrams-eq ngrams))
211 |         (prefix (butlast ngram)))
212 |     (maphash #`(cond ((funcall eq-test ngram %)
213 |                       (incf total (setf freq %%)))
214 |                      ((funcall eq-test prefix (butlast %))
215 |                       (incf total %%)))
216 |              (ngrams-table ngrams))
217 |     (if (zerop total) 0
218 |         (/ freq total))))
219 | 
220 | (defmethod top-ngram ((ngrams table-ngrams))
221 |   (with-slots (table max-freq) ngrams
222 |     (dotable (ngram freq table)
223 |       (when (= max-freq freq)
224 |         (return ngram)))))
225 | 
226 | (defmethod hapaxes ((ngrams table-ngrams))
227 |   (let (rez)
228 |     (with-slots (table min-freq) ngrams
229 |       (dotable (ngram freq table)
230 |         (when (= min-freq freq)
231 |           (push ngram rez)))
232 |       (values rez
233 |               min-freq))))
234 | 
235 | 
236 | ;;; Helper functions
237 | 
238 | (defun tokenize-ngram (ngrams str)
239 |   "Transform string STR into a list if necessary (depending on the order of NGRAMS)."
240 |   (if (> (ngrams-order ngrams) 1)
241 |       (tokenize <word-chunker> str)
242 |       str))
243 | 
--------------------------------------------------------------------------------
/nltk/ch1-2.md:
--------------------------------------------------------------------------------
1 | # NLTK 1.3 - Computing with Language: Simple Statistics
2 | 
3 | Most of the remaining parts of the first chapter of the NLTK book serve as
4 | an introduction to Python in the context of text processing. I won't
5 | translate that to Lisp, because there are much better resources
6 | explaining how to use Lisp properly. First and foremost I'd refer
7 | anyone interested to the appropriate chapters of
8 | [Practical Common Lisp](http://gigamonkeys.com/book):
9 | 
10 | - [List Processing](http://gigamonkeys.com/book/they-called-it-lisp-for-a-reason-list-processing.html)
11 | - [Collections](http://gigamonkeys.com/book/collections.html)
12 | - [Variables](http://gigamonkeys.com/book/variables.html)
13 | - [Macros: Standard Control Constructs](http://gigamonkeys.com/book/macros-standard-control-constructs.html)
14 | 
15 | It's only worth noting that Lisp has a different notion of lists than
16 | Python. Lisp's lists are linked lists, while Python's are essentially
17 | vectors. Lisp also has vectors as a separate data-structure, and it
18 | also has multidimensional arrays (something Python mostly lacks). And
19 | the set of Lisp's list operations is somewhat different from
20 | Python's. The list is the default sequence data-structure, but you should
21 | understand its limitations and know when to switch to vectors (when
22 | you have a lot of elements and often access them at random). Also,
23 | Lisp doesn't provide Python-style syntactic sugar for slicing and
24 | dicing lists, although all the operations are there in the form of
25 | functions. The only thing which isn't easily reproducible in Lisp is
26 | assigning to a slice:
27 | 
28 |     >>> sent[1:9] = ['Second', 'Third']
29 |     >>> sent
30 |     ['First', 'Second', 'Third', 'Last']
31 | 
32 | There's `replace`, but it can't shrink a sequence:
33 | 
34 |     CL-USER> (defvar sent '(1 2 3 4 5 6 7 8 9 0))
35 |     CL-USER> (replace sent '("Second" "Third") :start1 1 :end1 9)
36 |     (1 "Second" "Third" 4 5 6 7 8 9 0)
37 | 
38 | ## Ngrams
39 | 
40 | So, the only part worth discussing here is statistics.
41 | 
42 | Let's start with a __frequency distribution__. We have already used
43 | something similar in the previous part for text generation, but it was
44 | very basic and tailored to the task. Now it's time to get into some
45 | serious language modeling and discuss a more general-purpose
46 | implementation.
47 | 
48 | Such modeling is accomplished by collecting large amounts of
49 | statistical data about words and the appearance of their sequences in
50 | texts. These sequences are called __ngrams__. In a nutshell, you can
51 | think of an ngrams distribution as a table mapping ngram sequences to
52 | numbers.
53 | 
54 |     (defclass ngrams ()
55 |       ((order :initarg :order :reader ngrams-order)
56 |        (count :reader ngrams-count)
57 |        (max-freq :reader ngrams-max-freq)
58 |        (min-freq :reader ngrams-min-freq)
59 |        (total-freq :reader ngrams-total-freq)))
60 | 
61 | The crucial parameter of this class is `order`, which defines the
62 | length of a sequence. In practice, ngrams of orders from 1 to 5 may be
63 | used.
64 | 
65 | `ngrams` is an abstract class. In Lisp you don't have to specify this
66 | property explicitly: you just don't implement some of its methods. The
67 | simplest `ngrams` implementation — `table-ngrams` — uses an in-memory
68 | hash-table as a store. You can get ngram frequency and "probability"
69 | (the maximum likelihood estimation) from it, as well as the log of
70 | probability, which is used more often in calculations because it
71 | avoids the problem of floating-point rounding errors that occur
72 | when multiplying probabilities, which are rather small
73 | values.
74 | 
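As a quick arithmetic illustration of the latter point: with base-2 logs,
a product of probabilities becomes a sum, so even a long chain of tiny
factors stays within a comfortable numeric range:

    ;; log2(p1 * p2) = log2(p1) + log2(p2) ; same value computed two ways
    (log (* 0.001 0.002) 2)          ; => ~ -18.93
    (+ (log 0.001 2) (log 0.002 2))  ; => ~ -18.93

With that aside, here's `table-ngrams` in action: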
75 |     NLTK> (freq (text-bigrams *moby*) "The whale")
76 |     Indexing bigrams...
77 |     Number of bigrams: 116727
78 |     14
79 |     NLTK> (logprob (text-bigrams *moby*) "The whale")
80 |     -14.255587
81 | 
82 | So how do we get the bigrams of Moby Dick? For that we just have to count
83 | all of them in the text (this is a simplified version — some additional
84 | processing for sentence starts/ends is needed):
85 | 
86 |     (defun index-ngrams (order words &key ignore-case)
87 |       (make 'table-ngrams :order order
88 |             :table
89 |             (let ((ht (make-hash-table :test (if ignore-case 'equalp 'equal))))
90 |               (do ((tail words (rest tail)))
91 |                   ((shorter? tail order))
92 |                 (incf (get# (if (= order 1)
93 |                                 (car tail)
94 |                                 (sub tail 0 order))
95 |                             ht 0)))
96 |               ht)))
97 | 
98 | `table-ngrams` will be useful for simple experimentation and prototyping,
99 | like we do in our NLTK examples.
100 | 
101 |     NLTK> (defvar *1grams* (text-ugrams *moby*))
102 |     Indexing unigrams...
103 |     Number of unigrams: 19244
104 |     NLTK> (freq *1grams* "whale")
105 |     906
106 |     NLTK> (take 50 (vocab *1grams* :order-by '>))
107 |     ("," "the" "<S>" "</S>" "." "of" "and" "-" "a" "to" ";" "in" "\"" "that" "'"
108 |      "his" "it" "I" "!" "s" "is" "he" "with" "was" "as" "all" "for" "this" "at"
109 |      "by" "but" "not" "him" "from" "be" "on" "?" "so" "whale" "one" "you" "had"
110 |      "have" "there" "But" "or" "were" "now" "which" "me")
111 | 
112 | The strings "<S>" and "</S>" here denote special symbols for sentence
113 | start and end.
114 | 
115 | Here's a cumulative plot of them:
116 | 
117 | ![Cumulative Frequency Plot for 50 Most Frequent Words in Moby Dick](http://img.photobucket.com/albums/v473/pufpuf/ccounts_zpsbc41c690.png)
118 | 
119 | And here's just the counts graph:
120 | 
121 | ![Frequency Plot for 50 Most Frequent Words in Moby Dick](http://img.photobucket.com/albums/v473/pufpuf/counts_zpsa3d96079.png)
122 | 
123 | And, finally, here are the hapaxes:
124 | 
125 |     NLTK> (take 50 (hapaxes (text-ugrams *moby*)))
126 |     ("orphan" "retracing" "sheathed" "padlocks" "dirgelike" "Buoyed" "liberated"
127 |      "Till" "Ixion" "closing" "suction" "halfspent" "THEE" "ESCAPED" "ONLY"
128 |      "Epilogue" "thrill" "etherial" "intercept" "incommoding" "tauntingly"
129 |      "backwardly" "coincidings" "ironical" "intermixingly" "whelmings" "inanimate"
130 |      "animate" "lookouts" "infatuation" "Morgana" "Fata" "gaseous" "mediums"
131 |      "bewildering" "bowstring" "mutes" "voicelessly" "THUS" "grapple"
132 |      "unconquering" "comber" "foregone" "bullied" "uncracked" "unsurrendered"
133 |      "Diving" "flume" "dislodged" "buttress")
134 | 
135 | The next Python feature showcased here is __list comprehensions__.
136 | The idea behind them is to resemble set-theoretic notation in list
137 | definition. There's no such thing out-of-the-box in Lisp (although you
138 | can implement a variant even closer to set notation in
139 | [just 24 lines](http://lisp-univ-etc.blogspot.com/2013/01/real-list-comprehensions-in-lisp.html)),
140 | and the general approach is to favor functional-style filtering with
141 | variants of `map` and `remove-if`.
142 | 
143 |     NLTK> (sort (remove-if #`(< (length %) 15)
144 |                            (uniq (text-words *moby*)))
145 |                 'string<)
146 |     ("CIRCUMNAVIGATION" "Physiognomically" "apprehensiveness" "cannibalistically" "characteristically" "circumnavigating" "circumnavigation" "circumnavigations" "comprehensiveness" "hermaphroditical" "indiscriminately" "indispensableness" "irresistibleness" "physiognomically" "preternaturalness" "responsibilities" "simultaneousness" "subterraneousness" "supernaturalness" "superstitiousness" "uncomfortableness" "uncompromisedness" "undiscriminating" "uninterpenetratingly")
147 |     NLTK> (sort (remove-if #`(or (<= (length %) 7)
148 |                                  (<= (freq (text-ugrams *chat*) %) 7))
149 |                            (vocab (text-ugrams *chat*)))
150 |                 'string<)
151 |     ("20sUser104" <... another 130 users ...> "Question" "actually" "anything" "computer" "everyone" "football" "innocent" "listening" "remember" "seriously" "something" "talkcity_adults" "thinking" "together" "watching")
152 | 
153 | In the NLTK variant all the users are removed from the corpus with some pre-processing.
154 | 
155 | ## Language Modeling
156 | 
157 | But to be useful for real-world scenarios ngrams have to be large,
158 | really large (on the order of tens of gigabytes of data for
159 | trigrams). This means that you won't be able to simply store them in
160 | memory and will have to use some external storage: a general-purpose
161 | data-store, like a relational database, or some special-purpose
162 | software.
163 | 
164 | One such ngrams service that is available on the internet is
165 | [Microsoft Web N-gram Services](http://web-ngram.research.microsoft.com/).
166 | If you have a developer token you can query it over HTTP. The service
167 | returns only log-probabilities (and also log-conditional-probabilities)
168 | and runs really slowly, but it is capable of serving batch requests,
169 | i.e. returning probabilities for several ngrams at once. The
170 | implementation of the `ngrams` interface for such a service is provided in
171 | [contrib/ms-ngrams.lisp](https://github.com/vseloved/cl-nlp/blob/master/src/contrib/ms-ngrams.lisp).
172 | 
173 | We have already encountered conditional probabilities in the previous
174 | part. They have the following relationship with regular (so-called
175 | "joint") probabilities (for bigrams):
176 | 
177 |     p(A,B) = p(B|A) * p(A)
178 |     where p(A,B) is the joint probability and p(B|A) is the conditional one
179 | 
180 | I.e. they can be calculated from the current ngrams plus the ngrams of
181 | the preceding order. So, this operation is performed not on a single
182 | `ngrams` object, but on a pair of such objects. And they serve an
183 | important role, as we'll see below. But first we need to talk about
184 | language models.
185 | 
186 | A __language model__ is, basically, a collection of ngrams of
187 | different orders. By combining these ngrams we're able to obtain some
188 | other measures beyond a simple frequency value or probability
189 | estimate. The biggest added value of such a model is in the smoothing
190 | capabilities that it implements. The problem smoothing solves is that
191 | you'll almost never be able to have all possible ngrams in your
192 | data-store — there are just too many of them, and the language users keep
193 | adding more. But it's very nasty to get a 0 probability for some
194 | ngram. A language model allows us to find a balance between the number
195 | of ngrams we have to store and the possibility of getting meaningful
196 | probability numbers for any ngram. This is achieved with various
197 | smoothing techniques: interpolation and discounting. Some of the
198 | smoothing methods are:
199 | 
200 | - +1 smoothing
201 | - Kneser-Ney smoothing
202 | - Stupid Backoff
203 | 
204 | A good general compilation of various smoothing methods is assembled in
205 | [this presentation](http://courses.washington.edu/ling570/fei_fall09/10_26_Smoothing.pdf).
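To make the first of these concrete, here is a minimal sketch (not part
of CL-NLP) of +1, a.k.a. Laplace, smoothing on top of the `table-ngrams`
interface; `vocab-size`, the number of possible distinct ngrams, is an
assumed parameter:

    (defun laplace-prob (ngrams ngram vocab-size)
      ;; every count is incremented by 1, so an unseen ngram
      ;; (freq = 0) still gets a small non-zero probability
      (/ (1+ (freq ngrams ngram))
         (+ (ngrams-total-freq ngrams) vocab-size)))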
206 | 
207 | Let's look at the simplified implementation of scoring a sentence
208 | with the Stupid Backoff model:
209 | 
210 |     (defmethod logprob ((model language-model) (sentence list))
211 |       (with-slots (order) model
212 |         (let ((rez 0)
213 |               (s (append (cons "<S>" sentence) (list "</S>"))))
214 |           (when (shorter? s order)
215 |             (return-from logprob (logprob (get-ngrams (length s) model) s)))
216 |           ;; start of the sentence: p(A|<S>) * p(B|<S>,A) * ...
217 |           (do ((i 2 (1+ i)))
218 |               ((= i order))
219 |             (incf rez (cond-logprob model (sub s 0 i))))
220 |           ;; middle of the sentence
221 |           (do ((tail s (rest tail)))
222 |               ((shorter? tail order))
223 |             (incf rez (cond-logprob model (sub tail 0 order))))
224 |           rez)))
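The defining trick of Stupid Backoff hides inside `cond-logprob`: if the
full ngram was never seen, fall back to a shorter context with a fixed
penalty of 0.4 (Brants et al., 2007). A minimal sketch of the idea; the
list-of-ngrams representation here is for illustration only, not
CL-NLP's actual data layout:

    (defun stupid-backoff-score (ngrams-list ngram &optional (alpha 0.4))
      ;; NGRAMS-LIST holds ngrams objects from the highest order down to
      ;; unigrams; NGRAM is a list of words of the highest order's length
      (let ((ngrams (first ngrams-list)))
        (cond ((null (rest ngrams-list)) (prob ngrams (first ngram)))
              ((plusp (freq ngrams ngram)) (cond-prob ngrams ngram))
              (t (* alpha (stupid-backoff-score (rest ngrams-list)
                                                (rest ngram) alpha))))))

Note that these scores are not true probabilities (they don't sum to 1),
which is exactly the quirk we'll stumble upon in a moment.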
225 | 
226 | Eventually, the language model is able to return the estimated
227 | probability of any sequence of words, not limited to the maximum order
228 | of ngrams in it. This is usually calculated using the Markov assumption
229 | with the following formula (for a bigram language model):
230 | 
231 |     p(s) = p(A) * p(B|A) * p(C|B) * p(D|C) ... * p(Z|Y)
232 |     where s = A B ... Z
233 | 
234 |     NLTK> (defvar *moby-lm2*
235 |             (make-lm 'stupid-backoff-lm
236 |                      :1g (text-ugrams *moby*)
237 |                      :2g (text-bigrams *moby*)))
238 |     NLTK> (prob *moby-lm2* "This is a test sentence.")
239 |     6.139835e-20
240 | 
241 | That was, by the way, the probability of an unseen sentence with the
242 | word "sentence" completely missing from the vocabulary.
243 | 
244 |     NLTK> (prob *moby-lm2* '("<S>" "Moby" "Dick" "." "</S>"))
245 |     5.084481e-9
246 |     NLTK> (float (prob (text-bigrams *moby*) '("Moby" "Dick")))
247 |     3.0310333e-4
248 | 
249 | As you see, it's much more likely to encounter the sentence "Moby Dick."
250 | in this text, although not as likely as the phrase "Moby Dick". :)
251 | 
252 | Such a model is also able to generate random texts just like we did in
253 | the previous part. But because of the smoothing capability it's much
254 | more general, i.e. it can generate sequences with any word from the
255 | vocabulary, even phrases unseen before. At the same time it's much
256 | more computationally expensive, because generating each new word now
257 | takes `O(vocabulary size)` while it used to be `O(average number of words
258 | following any particular word)`.
259 | 
260 |     NLTK> (princ (generate *genesis* :order 2 :n 93))
261 |     burial to judged eaten sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung foreign longed them ought up Temanites Aran earth earth blessings surface surface surface surface surface surface surface surface surface floated Darkness Now homage earth Now In princes said vengeance It passed said divide In beginning earth Asenath said re The peaceful kind Calah said blameless mistress Chaldees said hunter said middle surface surface surface surface yonder earth rib said said smoking smoking smoking
262 | 
263 | And, as you see, this example totally doesn't resemble the one in the
264 | previous part. Is this a bug? No, just a trick that is played on us
265 | because we aren't following basic mathematical principles. In the Stupid
266 | Backoff model the probabilities don't add up to 1, and the conditional
267 | probability of an unseen ngram may be larger than the largest
268 | probability of any recorded one! This is why we end up producing
269 | sequences of repeated words. The problem is much less obvious for the
270 | trigram model, although the text remains complete gibberish.
271 | 
272 |     NLTK> (princ (generate *genesis* :order 3 :n 93))
273 |     brink time wagons fourth Besides south darkness listen foreigner Stay possessor lentils backwards be call dignity Kenizzites tar witness strained Yes appear colts bodies Reuel burn inheritance Galeed Hadar money touches conceal mighty foreigner spices Set pit straw son hurry yoke numbered gutters Dedan honest drove Magdiel Nod life assembly your Massa iniquity Tola still fifteen ascending wilderness everywhere shepherd harm bore Elah Jebusites Assyria butler Euphrates sinners gave Nephilim Stay garments find lifted communing closed Ir lights doing weeping shortly disobedience possessions drank peoples fifteen bless talked songs lamb far Shaveh heavens
274 | 
275 | This example shows us at least two things:
276 | 
277 | - we should always check that the mathematical properties of our models
278 |   still hold as we tweak them
279 | - although the major use-case for a language model is scoring, you can
280 |   get a feel of how well it will perform by looking at the texts it
281 |   generates
282 | 
283 | ## Finding collocations
284 | 
285 | This is another interesting and useful NLP problem with a very elegant
286 | baseline solution, which is explained in this
287 | [article](http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html).
288 | Hopefully, we'll get back to it in more detail in future chapters.
289 | For now, here are the results of implementing the algorithm from the article:
290 | 
291 |     NLTK> (collocations *inaugural*)
292 |     (("United" "States") ("fellow" "citizens") ("four" "years") ("years" "ago")
293 |      ("Federal" "Government") ("General" "Government") ("American" "people")
294 |      ("Vice" "President") ("Old" "World") ("Almighty" "God") ("Fellow" "citizens")
295 |      ("Chief" "Magistrate") ("Chief" "Justice") ("God" "bless") ("go" "forward")
296 |      ("every" "citizen") ("Indian" "tribes") ("public" "debt") ("one" "another")
297 |      ("foreign" "nations") ("political" "parties") ("State" "governments")
298 |      ("National" "Government") ("United" "Nations") ("public" "money")
299 |      ("national" "life") ("beloved" "country") ("upon" "us") ("fellow" "Americans")
300 |      ("Western" "Hemisphere"))
301 | 
302 | I'm surprised at how similar they are to NLTK's, considering that I
303 | didn't look at their implementation. In fact, they are the same up to
304 | the difference in the list of __stopwords__ (the dirty secret of every
305 | NLP application :) The code for the collocation extraction function can be
306 | found in [core/measures.lisp](https://github.com/vseloved/cl-nlp/blob/master/src/core/measures.lisp).
307 | 
308 | ## Other uses of ngrams
309 | 
310 | Ngrams are also sometimes used for individual characters to build
311 | character language models (see the little sketch below).
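A character-bigram index over the raw text can be built with the very
same `index-ngrams` machinery by turning the string into a list of
one-character strings (a sketch; `text-raw` is assumed here to be the
accessor for the text's raw-string slot):

    (defvar *moby-char-bigrams*
      (index-ngrams 2 (map 'list #'string (text-raw *moby*))))
    (freq *moby-char-bigrams* '("t" "h"))  ; how often "t" is followed by "h"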
312 | And here's another usage from NLTK — counting word lengths.
313 | 
314 |     NLTK> (defvar *moby-lengths*
315 |             (index-ngrams 1 (mapcar #'length (text-words *moby*))))
316 |     NLTK> (vocab *moby-lengths*)
317 |     (1 4 2 6 8 9 11 5 7 3 10 12 13 14 16 15 17 18 20)
318 |     NLTK> (ngrams-pairs *moby-lengths*)
319 |     ((1 . 58368) (4 . 42273) (2 . 35726) (6 . 17111) (8 . 9966) (9 . 6428)
320 |      (11 . 1873) (5 . 26595) (7 . 14399) (3 . 49633) (10 . 3528) (12 . 1053)
321 |      (13 . 567) (14 . 177) (16 . 22) (15 . 70) (17 . 12) (18 . 1) (20 . 1))
322 |     NLTK> (ngrams-max-freq *moby-lengths*)
323 |     58368
324 |     NLTK> (freq *moby-lengths* 3)
325 |     49633
326 | 
327 | ## Final thoughts
328 | 
329 | Language modeling is really the foundation of any serious NLP
330 | work. Having access to ngrams expands your possibilities immensely,
331 | but the problem with them is that moving from a prototype to a production
332 | implementation becomes tricky due to the problems of collecting a
333 | representative data-set and then storing it efficiently.
334 | Yet, there are solutions: the [Google Books Ngrams](http://storage.googleapis.com/books/ngrams/books/datasetsv2.html)
335 | and [Google Web1T](http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html)
336 | are examples of web-scale ngram data-sets, and there's also
337 | special-purpose software for storing large ngrams corpora and
338 | obtaining language models from them. The notable examples are
339 | [BerkeleyLM](http://code.google.com/p/berkeleylm/) and
340 | [KenLM](http://kheafield.com/code/kenlm/).
341 | 
--------------------------------------------------------------------------------
/nltk/ch1-1.md:
--------------------------------------------------------------------------------
1 | # NLTK - Computing with Language: Texts and Words
2 | 
3 | OK, let's get started with the NLTK book. Its first chapter tries to
4 | impress the reader with how simple it is to accomplish some neat
5 | things with texts using it. Actually, the underlying algorithms that
6 | allow us to achieve these results are mostly quite basic. We'll discuss
7 | them in this post, and the code for the first part of the chapter can
8 | be found in [nltk/ch1-1.lisp](https://github.com/vseloved/cl-nlp/blob/master/nltk/ch1-1.lisp).
9 | 
10 | ## Setting up texts for processing
11 | 
12 | For the purpose of this demonstration we'll need several texts, which
13 | can be downloaded from [NLTK data](http://nltk.org/nltk_data/).
14 | Namely, we'll use the following 5 texts:
15 | 
16 | - Moby Dick (can be found inside Project Gutenberg)
17 | - Sense and Sensibility (likewise from Project Gutenberg)
18 | - The Book of Genesis
19 | - Inaugural Address Corpus (this one comes as a collection of separate
20 |   texts that you'll need to cat together into one file)
21 | - NPS Chat Corpus
22 | 
23 | These texts are in the `nltk/data/` directory of `CL-NLP`.
24 | 
25 | The NLTK guys have created a special `Text` class and have defined all the
26 | operations in this chapter as its methods. We'll employ a slightly
27 | simpler approach and implement them as ordinary functions. Yet we'll
28 | also have a special-purpose `text` class to cache reusable results of
29 | long-running operations, like tokenization.
30 | 
31 |     NLTK> (load-nltk-texts "data/")
32 |     #<MOBY ...>
33 |     #<SENSE ...>
34 |     #<GENESIS ...>
35 |     ...
36 | 
37 | As you've already guessed, we've just loaded all the texts.
38 | The number in the last column is each text's character count.
39 | 
40 | Now they are stored in the `*texts*` hash-table.
41 | This is how we can access an individual text and name it for future usage:
42 | 
43 |     (defparameter *sense* (get# :sense *texts*))
44 | 
45 | (`get#` is one of the shorthand functions for operating on hash-tables
46 | defined in [rutils](https://github.com/vseloved/rutils/blob/master/core/packages.lisp))
47 | 
48 | Now we have a variable pointing to "Sense and Sensibility".
49 | If we examine it, this is what we'll see:
50 | 
51 |     NLTK> (describe *sense*)
52 |     #<TEXT ...>
53 |       [standard-object]
54 |     Slots with :INSTANCE allocation:
55 |       NAME = :SENSE
56 |       RAW = "[Sense and Sensibility by Jane Austen 1811]..
57 |       WORDS = #<unbound slot>
58 |       CTXS = #<unbound slot>
59 |       TRANSITIONS = #<unbound slot>
60 |       DISPERSION = #<unbound slot>
61 | 
62 | As you see, there are some unbound slots in this structure: `words`
63 | will hold every word in the text after tokenization, and `ctxs` will be a
64 | table of contexts for each word with their probabilities. By analogy,
65 | `transitions` will be a table of transition probabilities between
66 | words. Finally, `dispersion` will be a table of indices of word
67 | occurrences in the text. We'll use a lazy initialization strategy for them
68 | by defining `slot-unbound` CLOS methods that will be called on first
69 | access to each slot. For example, here's how `words` is initialized:
70 | 
71 |     (defmethod slot-unbound (class (obj text) (slot (eql 'words)))
72 |       (with-slots (raw words) obj
73 |         (format t "~&Tokenizing text...~%")
74 |         (prog1 (setf words (mapcan #`(cons "¶" (tokenize <word-tokenizer> %))
75 |                                    (tokenize <paragraph-splitter> raw)))
76 |           (format t "Number of words: ~A~%" (length words)))))
77 | 
78 | First we split the raw text into paragraphs, because we'd like to
79 | preserve paragraph information. Splitting is slightly involved, as
80 | paragraphs are separated by double newlines, while single newlines end
81 | every line in the text, and we have to distinguish the two. We insert
82 | pilcrow signs at paragraph boundaries. Then we tokenize the paragraphs
83 | into separate words (real words, punctuation marks, symbols, etc).
84 | 
85 | NB. I consider tokenization the crucial function of an NLP toolkit,
86 | and we'll explore it in more detail in one of the future posts.
87 | 
88 | 
89 | ## Implementing the examples
90 | 
91 | OK, now we are ready to start churning out examples from the first
92 | chapter.
93 | 
94 | The first one finds occurrences of certain words in the text. The NLTK guys
95 | perform the search on the tokenized texts. But I think it's quite OK
96 | to do it on raw strings with regexes. This has an added benefit of
97 | preserving the text structure.
98 | 
99 |     NLTK> (concordance *moby* "monstrous")
100 |     Displaying 11 of 11 matches
101 |     former, one was of a most monstrous size. ... This came towards
102 |     "Touching that monstrous bulk of the whale or ork we h
103 |     array of monstrous clubs and spears. Some were
104 |     you gazed, and wondered what monstrous
105 |     has survived the flood; most monstrous
106 |     monstrous fable, or still worse and mor
107 |     Of the Monstrous Pictures of Whales.
108 |     In connexion with the monstrous pictures of whales, I am stro
109 |     o enter upon those still more monstrous stories of them
110 |     ave been rummaged out of this monstrous
111 |     Whale-Bones; for Whales of a monstrous size are
112 | 
113 | With `:pass-newlines` on, we can get output similar to
114 | NLTK's. Let's try one of the homework tasks:
115 | 
116 |     NLTK> (concordance *genesis* "lived" :pass-newlines t)
117 |     Displaying 75 of 75 matches
118 |     t from Yahweh's presence, and lived in the land of Nod, east of
119 |     when they were created. Adam lived one hundred thirty years, and
Adam lived one hundred thirty years, and 120 | ... 121 | 122 | Now let's try similarity. Here we won't do without proper tokenization. 123 | 124 | NLTK> (similar *moby* "monstrous") 125 | Building word contexts... 126 | Tokenizing text... 127 | Number of words: 267803 128 | Number of unique words: 19243 129 | ("mystifying" "subtly" "maddens" "impalpable" "modifies" "vexatious" "candid" 130 | "exasperate" "doleful" "delightfully" "trustworthy" "domineering" "abundant" 131 | "puzzled" "untoward" "contemptible" "gamesome" "reliable" "mouldy" 132 | "determined") 133 | 134 | NLTK> (similar *sense* "monstrous") 135 | Building word contexts... 136 | Tokenizing text... 137 | Number of words: 146926 138 | Number of unique words: 6783 139 | ("amazingly" "vast" "heartily" "extremely" "remarkably" "great" "exceedingly" 140 | "sweet" "very" "so" "good" "a" "as") 141 | 142 | We mostly get the same words as NLTK's result, but with a slightly 143 | different ordering. It turns out, that the reason for this is very 144 | simple. The function `similar` matches words based on the contexts, 145 | where they occur. According to the famous quote by John Rupert Firth: 146 | 147 | > You shall know a word by the company it keeps 148 | 149 | But if we look at context overlap between various words from our list 150 | we'll see that the similarity relation between all these words is 151 | extremely weak: the decision is based on the match of a single context 152 | in which both words appeared in text. In fact, all the listed words 153 | are similar to the same extent. 154 | 155 | NLTK> (common-contexts *moby* "monstrous" "loving") 156 | ("most_and") 157 | NLTK> (common-contexts *moby* "monstrous" "mystifying") 158 | ("most_and") 159 | NLTK> (apply #'common-contexts *moby* (similar *moby* "monstrous")) 160 | ("most_and") 161 | 162 | Actually, the next NLTK example is, probably, the best context overlap 163 | you can get from those texts: 164 | 165 | NLTK> (common-contexts *sense* "monstrous" "very") 166 | ("am_glad" "is_pretty" "a_pretty" "a_lucky") 167 | 168 | Now let's draw a dispersion plot of the words from inaugural corpus. 169 | This task may seem difficult to approach at first, because the authors 170 | use a Python library `matplotlib` for drawing the graph. Fortunately, 171 | there's a language-agnostic tool to achieve similar goals, which is 172 | called `gnuplot`. There is a couple of Lisp wrapper libraries for it, 173 | and the actual code you need to write to drive it amounts to 2 lines 174 | (not counting the code to format the data for consumption). There are, 175 | actually, numerous language-agnostic tools on the Unix platform — 176 | don't forget to look for them when you have such kind of specific need :) 177 | 178 | ![gnuplot dispersion graph with cl-nlp](http://img.photobucket.com/albums/v473/pufpuf/dispersion_zps7447117e.jpg) 179 | 180 | The next problem in this part also seems pretty hard. And, in fact, it 181 | is extremely hard if framed correctly — to generate a meaningful text 182 | based on some other text. But the example solves an easier task to 183 | generate _a somewhat_ meaningful text. And the approach taken to 184 | solve it is a very simple one — it is the baseline method in this area 185 | and is based on Markov chains. There was even a famous mock with 186 | Markov chains in the times of Usenet called [Mark V. Shaney](http://en.wikipedia.org/wiki/Mark_V_Shaney). 187 | Markov models have one principal parameter — _order_. 188 | Mark V. Shaney was an order 2 chain. 
189 | 
190 | Let's try to generate something with it:
191 | 
192 |     NLTK> (princ (generate *genesis* :order 2 :n 93))
193 |     Building prefix transitions index of length 2...
194 |     Number of prefixes: 14326
195 |     In the marsh grass . Behold , the male and female . Of these , and that Jacob obeyed his father . I will return to the earth was dry . God set them in the earth . Noah built an altar there , and put me in the pit ; and that which was our father ' s hand , then the handmaids and their sin is very grievous in your flesh for an everlasting covenant between God and every man to see the nakedness of his flock and keep it .
196 | 
197 | And what if we raise the order?
198 | 
199 |     NLTK> (princ (generate *genesis* :order 3 :n 93))
200 |     Building prefix transitions index of length 3...
201 |     Number of prefixes: 28206
202 |     In the beginning God created the large sea creatures , and every bird , whatever moves on the earth . He stayed yet another seven days , and sent over that which he had gained in Paddan Aram . Esau saw that the interpretation was good , he said to them , they conspired against him to kill him . They took captive all their little ones , and for days and years ; and let it divide the waters from the waters . God said to the younger , and his seed
203 | 
204 | 
205 | The text starts to resemble the original more and more. Also you may
206 | notice that the text will always start with "In". That's because
207 | Genesis isn't split into paragraphs, and our generation starts from
208 | paragraph beginnings, of which there's only one here.
209 | 
210 | OK, this seems to work, but with probabilities you never know for sure... ;)
211 | 
212 | Now, we're left with very simple tasks. Let's just do them:
213 | 
214 |     NLTK> (length (text-words *genesis*))
215 |     44671
216 | 
217 | In the book they had a slightly different number: 44764. This is
218 | because of the different tokenization scheme. The differences can be
219 | seen in the next snippet (we have a cleaner version for this use case :)
220 | 
221 |     NLTK> (take 20 (sort (remove-duplicates (text-words *genesis*) :test 'string=) 'string<))
222 |     ("!" "\"" "'" "(" ")" "," "-" "." ":" ";" "?" "A" "Abel" "Abida" "Abimael"
223 |      "Abimelech" "About" "Abraham" "Abram" "Accad")
224 | 
225 | What about the vocabulary size? Well, once again, it's very similar to the
226 | NLTK number (2789).
227 | 
228 |     NLTK> (length (remove-duplicates (text-words *genesis*) :test 'string=))
229 |     2634
230 | 
231 | Now, let's look for words:
232 | 
233 |     NLTK> (count "smote" (text-words *genesis*) :test 'string=)
234 |     0
235 | 
236 | Hmm... What about some other word?
237 | 
238 |     NLTK> (count "Abraham" (text-words *genesis*) :test 'string=)
239 |     134
240 | 
241 | This seems to work. What's the problem with `"smote"`? It turns out
242 | there's no such word in the Genesis text: at least an examination of
243 | the text doesn't show any traces of it. Looks like we've found a bug
244 | in the book :)
245 | 
246 |     (defun percentage (count total)
247 |       (/ (* 100.0 count) total))
248 |     NLTK> (with-slots (words) *inaugural*
249 |             (percentage (count "a" words :test 'string=) (length words)))
250 |     1.4597242
251 | 
252 |     (defun lexical-diversity (text)
253 |       (with-slots (words) text
254 |         (/ (float (length words))
255 |            (length (remove-duplicates words :test 'string=)))))
256 |     NLTK> (lexical-diversity *genesis*)
257 |     16.959377
258 |     NLTK> (lexical-diversity *chat*)
259 |     6.9837084
260 | 
261 | Interestingly, the results for the `*chat*` corpus differ from the NLTK
262 | ones, although they are calculated based on the tokens provided in the
263 | corpus and not extracted by our tokenization algorithms. This text is
264 | special because it is extracted from an XML-structured document
265 | which also contains the full tokenization.
266 | To use it we swap `words` in the `*chat*` corpus:
267 | 
268 |     NLTK> (setf (text-words *chat*)
269 |                 (mapcar #'token-word
270 |                         (flatten (corpus-text-tokens ncorp:+nps-chat-corpus+))))
271 | 
272 | But first we need to get the corpus and extract the data from it —
273 | see `corpora/nps-chat.lisp` for details.
274 | 
275 | And, finally, we can examine the Brown Corpus.
276 | 
277 |     Genre               |  Tokens  |  Types  |  Lexical Diversity
278 |     --------------------+----------+---------+--------------------
279 |     PRESS-REPORTAGE     |  100554  |  14393  |  7.0
280 |     PRESS-EDITORIAL     |  61604   |  9889   |  6.2
281 |     PRESS-REVIEWS       |  40704   |  8625   |  4.7
282 |     RELIGION            |  39399   |  6372   |  6.2
283 |     SKILL-AND-HOBBIES   |  82345   |  11934  |  6.9
284 |     POPULAR-LORE        |  110299  |  14502  |  7.6
285 |     BELLES-LETTRES      |  173096  |  18420  |  9.4
286 |     MISCELLANEOUS-GOVER |  70117   |  8180   |  8.6
287 |     LEARNED             |  181888  |  16858  |  10.8
288 |     FICTION-GENERAL     |  68488   |  9301   |  7.4
289 |     FICTION-MYSTERY     |  57169   |  6981   |  8.2
290 |     FICTION-SCIENCE     |  14470   |  3232   |  4.5
291 |     FICTION-ADVENTURE   |  69342   |  8873   |  7.8
292 |     FICTION-ROMANCE     |  70022   |  8451   |  8.3
293 |     HUMOR               |  21695   |  5016   |  4.3
294 | 
295 | 
296 | OK, seems like we're done with this chapter. So far there was no
297 | rocket science involved, but it was interesting...
298 | 
299 | ## Implementation details
300 | 
301 | So, what are the interesting bits we haven't discussed?
302 | 
303 | First, let's look at a small optimization trick for calculating
304 | `lexical-diversity`. Our initial variant uses the library function
305 | `remove-duplicates`, which is highly inefficient for this case.
306 | 
307 |     NLTK> (time (lexical-diversity *chat*))
308 |     Evaluation took:
309 |       9.898 seconds of real time
310 |       9.888618 seconds of total run time (9.888618 user, 0.000000 system)
311 |       99.91% CPU
312 |       23,687,560,947 processor cycles
313 |       229,392 bytes consed
314 | 
315 | What we'd like to do is something similar to the Python version,
316 | which puts everything in a set and calculates its size.
317 | A set is easily represented with a hash-table:
318 | 
319 |     (defun uniq (list &key raw case-insensitive)
320 |       "Return only unique elements from LIST either as a new list
321 |        or as a hash-table if RAW is set. Can be CASE-INSENSITIVE."
322 |       (let ((uniqs (make-hash-table :test (if case-insensitive 'equalp 'equal))))
323 |         (dolist (elt list)
324 |           (set# elt uniqs t))
325 |         (if raw uniqs (ht-keys uniqs))))
326 | 
327 | Here's the time of the same calculation using `uniq`:
328 | 
329 |     NLTK> (time (lexical-diversity *chat*))
330 |     Evaluation took:
331 |       0.014 seconds of real time
332 |       0.012001 seconds of total run time (0.012001 user, 0.000000 system)
333 |       85.71% CPU
334 |       33,396,336 processor cycles
335 |       613,568 bytes consed
336 | 
337 | A roughly 700x speed increase!
338 | 
339 | Now, let's return to text generation. It is accomplished with the
340 | following loop (a simplified version):
341 | 
342 |     (loop :for i :from 1 :to length :do
343 |        (let ((r (random 1.0))
344 |              (total 0))
345 |          (dotable (word prob
346 |                    (or (get# prefix transitions)
347 |                        ;; no continuation - start anew
348 |                        (prog1 (get# (setf prefix initial-prefix) transitions)
349 |                          ;; add period unless one is already there
350 |                          (unless (every #'period-char-p (car rez))
351 |                            (push "." rez)
352 |                            (incf i)))))
353 |            (when (> (incf total prob) r)
354 |              (push word rez)
355 |              (setf prefix (cons word (butlast prefix)))
356 |              (return)))))
357 | 
358 | On each iteration it places all possible continuations of the current
359 | prefix on a segment from 0 to 1 and generates a random number that
360 | points to one of the variants. If there's no continuation, it starts
361 | anew.
362 | 
363 | The NLTK book, actually, uses a slightly more complicated model: first it
364 | builds a probability distribution on top of the transition frequencies
365 | and then generates the text from the probabilities. As of now, I don't
366 | see why this is needed or whether it makes any difference in the results.
367 | 
368 | And, finally, here's how we draw the dispersion plot:
369 | 
370 |     (defun plot-dispersion (words file)
371 |       "Plot WORDS dispersion data from FILE."
372 |       (cgn:start-gnuplot)
373 |       (cgn:format-gnuplot "set title \"Lexical Dispersion Plot\"")
374 |       (cgn:format-gnuplot "plot \"~A\" using 1:2:yticlabels(3) title \"\"" file))
375 | 
376 | It's just 1 line of `gnuplot` code, actually, but we also need to
377 | prepare the data in a tab-separated text file:
378 | 
379 |     (defun dump-data (words dispersion-table)
380 |       "Dump data from DISPERSION-TABLE for WORDS into a temporary file
381 |        and return its name."
382 |       (let ((filename (fmt "/tmp/~A" (gensym))))
383 |         (with-out-file (out filename)
384 |           (format out "0~t0~t~%")
385 |           (doindex (i word words)
386 |             (dolist (idx (get# word dispersion-table))
387 |               (format out "~A~t~A~t~A~%" idx (1+ i) word)))
388 |           (format out "0~t~A~t~%" (1+ (length words))))
389 |         filename))
390 | 
391 | To wrap up, we've seen a demonstration of a lot of useful tools for
392 | text processing, and also discussed how they can be built. Among all
393 | of them I want to highlight the utility of the seemingly simplistic
394 | `concordance`, which is actually a kind of `grep` tool that is
395 | indispensable for any text exploration. I even used it a couple of
396 | times to debug issues in more complex functions from this pack.
397 | 
--------------------------------------------------------------------------------