├── .gitignore
├── nltk
│   ├── graphs
│   │   └── inaugural-dispersion.jpg
│   ├── package.lisp
│   ├── ch1-2.lisp
│   ├── ch1-1.lisp
│   ├── ch1-2.md
│   └── ch1-1.md
├── data
│   └── abbrevs-with-dot.txt
├── README.md
├── src
│   ├── contrib
│   │   ├── packages.lisp
│   │   └── ms-ngrams.lisp
│   ├── test-util.lisp
│   ├── corpora
│   │   ├── corpus.lisp
│   │   ├── nps-chat.lisp
│   │   └── brown.lisp
│   ├── core
│   │   ├── measures.lisp
│   │   ├── language-models.lisp
│   │   ├── indexing.lisp
│   │   ├── tokenization.lisp
│   │   └── ngrams.lisp
│   ├── user.lisp
│   ├── packages.lisp
│   ├── generation
│   │   └── markov-chain.lisp
│   └── util.lisp
├── LICENSE
├── cl-nltk.asd
└── cl-nlp.asd

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*~
*.fasl
\#*
.*
!.gitignore

--------------------------------------------------------------------------------
/nltk/graphs/inaugural-dispersion.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/html/cl-nlp/master/nltk/graphs/inaugural-dispersion.jpg

--------------------------------------------------------------------------------
/nltk/package.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(cl:defpackage #:nltk
  (:use #:common-lisp #:rutil #:nlp-user))

--------------------------------------------------------------------------------
/data/abbrevs-with-dot.txt:
--------------------------------------------------------------------------------
e.g.
i.e.
Mr.
Ms.
Mrs.
Dr.
St.
Rd.
Ave.
A.D.
B.C.
Mt.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CL-NLP — Lisp NLP tool-set

Caution: this is vaporware! Come back in 2014, unless you want to contribute...

...but if you want to contribute, you're very welcome.
Feel free to write to <vseloved@gmail.com>

--------------------------------------------------------------------------------
/src/contrib/packages.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(cl:defpackage #:nlp.contrib.ngrams
  (:use #:common-lisp #:rutil #:nlp)
  (:export #:ms-ngrams
           #:ms-ngrams-url
           #:ms-ngrams-user-token
           #:ms-ngrams-catalog))

--------------------------------------------------------------------------------
/src/test-util.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.test-util)
(named-readtables:in-readtable rutils-readtable)


(defun test-file (filename)
  (merge-pathnames (strcat "test/" filename) +project-root+))


(defun test-on-corpus (func corpus expected &key (test 'equal))
  ;; TODO: a stub. It should run FUNC over CORPUS, compare the results
  ;; with EXPECTED using TEST, and tally true/false positives.
  (let ((tp 0)
        (fp 0))
    ))

#+nil
(test-on-corpus #`(tokenize (make 'regex-word-tokenizer) %)
                +brown-corpus+
                (test-file "brown-tokenization.txt"))

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2013 Vsevolod Dyomkin

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------
/cl-nltk.asd:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:asdf)

(defsystem #:cl-nltk
  :version "0.0.2"
  :description "Implementation of the examples from the NLTK book."
  :author "Vsevolod Dyomkin <vseloved@gmail.com>"
  :maintainer "Vsevolod Dyomkin <vseloved@gmail.com>"
  :license "Apache 2.0"
  :depends-on (#:rutils #:cl-ppcre #:cl-nlp #:cgn)
  :serial t
  :components
  ((:module #:nltk
    :serial t
    :components
    ((:file "package")
     (:file "ch1-1")
     (:file "ch1-2")))))

--------------------------------------------------------------------------------
/src/corpora/corpus.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.corpora)
(named-readtables:in-readtable rutils-readtable)


(defstruct corpus
  "A structure to cache a unilingual corpus in raw and processed forms."
  name
  lang
  raw-texts
  clean-texts
  text-tokens)

(defstruct token
  "A corpus token with position and possibly tag."
  beg
  end
  word
  tag)


(defgeneric read-corpus (type file)
  (:documentation
   "Read corpus data of a certain TYPE (a keyword) from FILE.
    Returns as values:
    - raw text data
    - cleaned-up text
    - list of tokens from the text"))

--------------------------------------------------------------------------------
/src/core/measures.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.core)
(named-readtables:in-readtable rutils-readtable)


(declaim (inline log2))
(defun log2 (x)
  "Base 2 logarithm."
  (/ (log x) (log 2)))

(defun entropy (samples &optional total)
  "Compute Shannon's entropy of the SAMPLES list.
   To save on calculation, a pre-calculated TOTAL can be provided."
  (unless total
    (setf total (reduce #'+ samples)))
  (reduce #'+ (mapcar #`(if (zerop %)
                            0
                            (let ((r (/ % total)))
                              (* r (log2 r))))
                      samples)))

(defun log-likelihood-ratio (ab a~b ~ab ~a~b)
  "Calculate the log-likelihood ratio between events A and B given
   counts of A and B occurring together and separately."
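  ;; The ratio is computed from the 2x2 contingency table of the two events
  ;; via entropies: 2 * total * (H(rows) + H(cols) - H(table)), which is what
  ;; the subtraction below amounts to, since ENTROPY above returns the
  ;; negated sum of r*log2(r). A sketch on made-up counts (bigram "new york"
  ;; seen 10 times, "new" without "york" 20 times, "york" without "new"
  ;; 5 times, out of 10000 bigrams):
  ;;   (log-likelihood-ratio 10 20 5 9965)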
  (let ((total (+ ab a~b ~ab ~a~b)))
    (* 2 total (- (entropy (list ab a~b ~ab ~a~b) total)
                  (entropy (list (+ ab a~b) (+ ~ab ~a~b)) total)
                  (entropy (list (+ ab ~ab) (+ a~b ~a~b)) total)))))

--------------------------------------------------------------------------------
/src/user.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp-user)
(named-readtables:in-readtable rutils-readtable)


(defun print-word-in-contexts (word string &key (width 25) pass-newlines limit)
  "Print all (or LIMIT) WORD occurrences in STRING with the surrounding
   context up to WIDTH chars. If PASS-NEWLINES isn't set, the context
   will be shown up to the closest newline."
  (declare (type (integer 0) width))
  (let* ((regex (re:create-scanner (fmt "\\b~A\\b" word)
                                   :case-insensitive-mode t))
         (matches (re:all-matches regex string))
         (len (/ (length matches) 2)))
    (format t "Displaying ~A of ~A matches~%" (if limit (min limit len) len) len)
    (loop :for (s e) :on matches :by #'cddr
          :for i :from 1 :do
       (when (and limit (> i limit))
         (return))
       (let ((s- (max 0 (- s width)))                   ; clamp the context
             (e+ (min (length string) (+ e width))))    ; to the string bounds
         (if pass-newlines
             (format t "~A~%" (substitute-if #\Space #'white-char-p
                                             (subseq string s- e+)))
             (let ((l-pos (max s- (1+ (or (position #\Newline string
                                                    :end s :from-end t)
                                          -1))))
                   (r-pos (min e+ (1- (or (position #\Newline string :start e)
                                          (1+ (length string)))))))
               (format t "~@[~A~]~A~@[~A~]~%"
                       (unless (= s- l-pos) (filler (- l-pos s-)))
                       (subseq string l-pos r-pos)
                       (unless (= e+ r-pos) (filler (- e+ r-pos))))))))))

--------------------------------------------------------------------------------
/cl-nlp.asd:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:asdf)

(defsystem #:cl-nlp
  :version "0.0.3"
  :description "NLP toolkit for Common Lisp."
  :author "Vsevolod Dyomkin <vseloved@gmail.com>"
  :maintainer "Vsevolod Dyomkin <vseloved@gmail.com>"
  :license "Apache 2.0"
  :depends-on (#:rutils #:cl-fad #:cl-ppcre #:cxml)
  :serial t
  :components
  ((:module #:src
    :serial t
    :components
    ((:file "packages")
     (:file "util")
     (:module #:corpora
      :serial t
      :components
      ((:file "corpus")
       (:file "brown")
       (:file "nps-chat")))
     (:file "test-util")
     (:module #:core
      :serial t
      :components
      ((:file "measures")
       (:file "tokenization")
       (:file "ngrams")
       (:file "language-models")
       (:file "indexing")))
     (:module #:generation
      :serial t
      :components
      ((:file "markov-chain")))
     (:file "user")))))

(defsystem #:cl-nlp.contrib
  :version "0.0.1"
  :description "CL-NLP additional packages."
  :author "Vsevolod Dyomkin <vseloved@gmail.com>"
  :maintainer "Vsevolod Dyomkin <vseloved@gmail.com>"
  :license "Apache 2.0"
  :depends-on (#:cl-nlp #:drakma)
  :serial t
  :components
  ((:module #:src
    :components
    ((:module #:contrib
      :serial t
      :components
      ((:file "packages")
       (:file "ms-ngrams")))))))

--------------------------------------------------------------------------------
/src/corpora/nps-chat.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.corpora)
(named-readtables:in-readtable rutils-readtable)


(defmethod read-corpus-file ((type (eql :nps-chat)) file)
  "Read an individual file from the NPS Chat Corpus."
  (cxml:parse-file file (make 'nps-chat-sax)))

#+manually
(defparameter +nps-chat-corpus+
  (let ((corpus (make-corpus :name "NPS Chat Corpus" :lang :en-us)))
    (fad:walk-directory
     (merge-pathnames "corpora/nps_chat/" +project-root+)
     #`(when (string= "xml" (pathname-type %))
         (mv-bind (text tokens) (read-corpus-file :nps-chat %)
           (push (mapcar #'car text) (corpus-raw-texts corpus))
           (push tokens (corpus-text-tokens corpus)))))
    corpus)
  "NPS Chat Corpus, Release 1.0 (July 2008).")


;; SAX parsing

(defun attr (name attributes)
  "Get the value of the attribute NAME among SAX ATTRIBUTES."
  (sax::standard-attribute-value
   (find name attributes :test 'string=
         :key #'sax::standard-attribute-local-name)))

(defclass nps-chat-sax (sax:sax-parser-mixin)
  ((posts :initform nil)
   (tokens :initform nil)))

(defmethod sax:start-element ((sax nps-chat-sax)
                              namespace-uri local-name qname attributes)
  (with-slots (posts tokens) sax
    (case (mkeyw local-name)
      (:post (push (attr "class" attributes) posts))
      (:terminals (push (list nil) tokens))
      (:t (push (make-token :word (attr "word" attributes)
                            :tag (mkeyw (attr "pos" attributes)))
                (car tokens))))))

(defmethod sax:characters ((sax nps-chat-sax) data)
  (with-slots (posts) sax
    (when (stringp (car posts))
      (setf (car posts) (cons data (car posts))))))

(defmethod sax:end-element ((sax nps-chat-sax) namespace-uri local-name qname)
  ;; defined to avoid warnings
  )

(defmethod sax:end-document ((sax nps-chat-sax))
  (values (reverse (slot-value sax 'posts))
          (reverse (mapcar #`(rest (reverse %))
                           (slot-value sax 'tokens)))))

--------------------------------------------------------------------------------
/src/contrib/ms-ngrams.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.contrib.ngrams)
(named-readtables:in-readtable rutils-readtable)


;;; Microsoft ngrams

(defclass ms-ngrams (ngrams)
  ((count :initform -1)  ; special value to indicate that we don't know it :)
   (url :initarg :url :accessor ms-ngrams-url
        :initform "http://web-ngram.research.microsoft.com/rest/lookup.svc")
   (user-token :initarg :user-token :accessor ms-ngrams-user-token)
   (catalog :initarg :catalog :initform "bing-body/apr10"
            :accessor ms-ngrams-catalog))
  (:documentation
   "Frontend to the Microsoft Ngrams service.
    See http://web-ngram.research.microsoft.com/info/"))

(flet ((to-string (ngram)
         "If NGRAM is a list, convert it to a string."
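         ;; The MACROLETs below build the actual REST calls: single lookups
         ;; are GET requests of the form
         ;;   <url>/<catalog>/<order>/<op>?u=<user-token>&p=<ngram>
         ;; while batch lookups are POST requests with one ngram per line.
         ;; Hypothetical usage ("MY-TOKEN" is a placeholder):
         ;;   (logprob (make 'ms-ngrams :order 2 :user-token "MY-TOKEN")
         ;;            '("hello" "world"))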
         (if (listp ngram) (strjoin " " ngram) ngram)))

  (macrolet ((query-ngrams (op)
               `(with-slots (url user-token catalog order) ngrams
                  (let ((*read-eval* nil))
                    (read-from-string
                     (drakma:http-request
                      (fmt "~A/~A/~A/~A?u=~A&p=~A"
                           url catalog order ,op user-token
                           (to-string ngram))))))))

    (defmethod logprob ((ngrams ms-ngrams) ngram)
      (query-ngrams "jp"))

    (defmethod cond-logprob ((ngrams ms-ngrams) ngram)
      (query-ngrams "cp"))

    )  ; end of macrolet

  (macrolet ((query-ngrams (op)
               `(with-slots (url user-token catalog order) ngrams
                  (let ((*read-eval* nil))
                    (mapcar #'read-from-string
                            (split-sequence
                             #\Newline (drakma:http-request
                                        (fmt "~A/~A/~A/~A?u=~A"
                                             url catalog order ,op user-token)
                                        :method :post
                                        :content (fmt "~{~A~%~}"
                                                      (mapcar #'to-string
                                                              ngrams-list)))))))))

    (defmethod logprobs ((ngrams ms-ngrams) &rest ngrams-list)
      (query-ngrams "jp"))

    (defmethod cond-logprobs ((ngrams ms-ngrams) &rest ngrams-list)
      (query-ngrams "cp"))

    )  ; end of macrolet
  )  ; end of flet


(defmethod ngrams-eq ((ngrams ms-ngrams))
  #'equalp)

--------------------------------------------------------------------------------
/nltk/ch1-2.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nltk)
(named-readtables:in-readtable rutils-readtable)

(defclass text ()
  ((name :initarg :name)
   (raw :initarg :raw :accessor text-raw)
   (words :accessor text-words)
   (ctxs :accessor text-ctxs)
   (transitions :accessor text-transitions)
   (dispersion :accessor text-dispersion)
   (ugrams :accessor text-ugrams)
   (bigrams :accessor text-bigrams)
   (trigrams :accessor text-trigrams)))

(defmethod slot-unbound (class (obj text) (slot (eql 'ugrams)))
  (with-slots (words ugrams) obj
    (format t "~&Indexing unigrams...~%")
    (prog1 (setf ugrams (index-ngrams 1 words))
      (format t "Number of ugrams: ~A~%" (ngrams-count ugrams)))))

(defmethod slot-unbound (class (obj text) (slot (eql 'bigrams)))
  (with-slots (words bigrams) obj
    (format t "~&Indexing bigrams...~%")
    (prog1 (setf bigrams (index-ngrams 2 words))
      (format t "Number of bigrams: ~A~%" (ngrams-count bigrams)))))

(defmethod slot-unbound (class (obj text) (slot (eql 'trigrams)))
  (with-slots (words trigrams) obj
    (format t "~&Indexing trigrams...~%")
    (prog1 (setf trigrams (index-ngrams 3 words))
      (format t "Number of trigrams: ~A~%" (ngrams-count trigrams)))))

(defun collocations (text)
  (find-collocations (text-bigrams text) :n 30))

(defun generate (text &key (n 20) (order 2))
  "Generate random text of N words, based on TEXT."
  (with-slots (transitions) text
    (string-trim (append +white-chars+ +newline-chars+)
                 (fmt "~{~A ~}"
                      (generate-text (make 'markov-chain-generator :order order)
                                     (make-lm 'stupid-backoff-lm
                                              :1g (text-ugrams text)
                                              :2g (when (> order 1)
                                                    (text-bigrams text))
                                              :3g (when (> order 2)
                                                    (text-trigrams text)))
                                     n)))))

;; Plotting

(defun dump-counts (ngrams n order-by cumulative)
  "Dump N NGRAMS counts (or CUMULATIVE counts) ordered by ORDER-BY."
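  ;; The dump is a plain whitespace-separated file that gnuplot can read
  ;; directly: rank, the ngram itself, and its (possibly cumulative) count,
  ;; one ngram per line, e.g. (made-up counts):
  ;;   1  ("the")  14620
  ;;   2  ("of")   6976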
  (let ((filename (fmt "/tmp/~A" (gensym)))
        (total 0))
    (with-out-file (out filename)
      (doindex (i pair (ngrams-pairs ngrams :order-by order-by))
        (when (and n (> i n))
          (return))
        (format out "~A~t~S~t~A~%" (1+ i) (car pair)
                (if cumulative
                    (incf total (cdr pair))
                    (cdr pair)))))
    filename))

(defun plot (ngrams &key n (order-by '>) cumulative)
  "Plot NGRAMS counts."
  (cgn:with-gnuplot (t)
    (cgn:format-gnuplot "set xtics rotate 90")
    (cgn:format-gnuplot "set ylabel \"~@[Cumulative ~]Counts\"" cumulative)
    (cgn:format-gnuplot
     "plot \"~A\" using 1:3:xtic(2) with lines title \"\""
     (dump-counts ngrams n order-by cumulative))))

--------------------------------------------------------------------------------
/src/corpora/brown.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.corpora)
(named-readtables:in-readtable rutils-readtable)


(defmethod read-corpus-file ((type (eql :brown)) file)
  "Read an individual file from the Brown corpus."
  (let ((text (string-trim +white-chars+ (read-file file)))
        (offset 0)
        tokens)
    (loop :for (beg end) :on (re:all-matches "[^\\s]+" text) :by #'cddr :do
       (let* ((tok (subseq text beg end))
              (/-pos (position #\/ tok :from-end t))
              (word (subseq tok 0 /-pos))
              (tag (subseq tok (1+ /-pos))))
         (when (member word '("``" "''") :test #'string=)
           (setf word "\""
                 end (1- end)))
         (push (make-token :beg (- beg offset)
                           :end (- end (incf offset (1+ (length tag))))
                           :word word :tag tag)
               tokens)))
    (values text
            (let ((clean-text (make-string (token-end (first tokens))
                                           :initial-element #\Space)))
              (dolist (token (reverse tokens) clean-text)
                (with-slots (beg end word) token
                  (setf (subseq clean-text beg end) word))))
            (reverse tokens))))

#+manually
(defparameter +brown-corpus+
  (let ((brown-topics-mapping #{#\a :press-reportage
                                #\b :press-editorial
                                #\c :press-reviews
                                #\d :religion
                                #\e :skill-and-hobbies
                                #\f :popular-lore
                                #\g :belles-lettres
                                #\h :miscellaneous-government-house-organs
                                #\j :learned
                                #\k :fiction-general
                                #\l :fiction-mystery
                                #\m :fiction-science
                                #\n :fiction-adventure
                                #\p :fiction-romance
                                #\r :humor
                               })
        (raw-texts #{})
        (clean-texts #{})
        (text-tokens #{})
        (corpus #{}))
    (dolist (path (directory (merge-pathnames "corpora/brown/*" +project-root+)))
      (when (= 4 (length (pathname-name path)))
        (let ((topic (get# (char (pathname-name path) 1) brown-topics-mapping)))
          (unless (get# topic raw-texts)
            (set# topic raw-texts '(nil))
            (set# topic clean-texts '(nil))
            (set# topic text-tokens '(nil)))
          (mv-bind (raw text tokens) (read-corpus-file :brown path)
            (push raw (get# topic raw-texts))
            (push text (get# topic clean-texts))
            (push tokens (get# topic text-tokens))))))
    (dolist (topic (ht-keys raw-texts))
      (set# topic corpus
            (make-corpus
             :name (fmt "Brown Corpus - ~A" topic)
             :lang :en-us
             :raw-texts (rest (reverse (get# topic raw-texts)))
             :clean-texts (rest (reverse (get# topic clean-texts)))
             :text-tokens (rest (reverse (get# topic text-tokens))))))
    corpus)
  "Brown University Standard Corpus of Present-Day American English.")
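
;; Brown corpus files interleave words with their POS tags, e.g. the first
;; sentence of file ca01 begins: The/at Fulton/np-tl County/nn-tl ...
;; which is why READ-CORPUS-FILE above splits each token on its last slash.
#+nil
(read-corpus-file :brown (merge-pathnames "corpora/brown/ca01" +project-root+))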
--------------------------------------------------------------------------------
/src/packages.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(cl:defpackage #:nlp.util
  (:nicknames #:nutil)
  (:use #:common-lisp #:rutil)
  (:export #:cl-nlp-error
           #:not-implemented-error

           #:+newline+
           #:+newline-chars+
           #:newline-char-p
           #:+white-chars+
           #:white-char-p
           #:+period-chars+
           #:period-char-p

           #:*stopwords-en*
           #:ending-word-p

           #:filler

           #:+project-root+
           #:data-file
           #:list-from-file

           #:bin-search

           #:define-lazy-singleton
           #:sorted-ht-keys
           #:shorter?
           ))

(cl:defpackage #:nlp.corpora
  (:nicknames #:ncorp)
  (:use #:common-lisp #:rutil #:nlp.util)
  (:export #:corpus
           #:make-corpus

           #:corpus-name
           #:corpus-lang
           #:corpus-raw-texts
           #:corpus-clean-texts
           #:corpus-text-tokens

           #:read-corpus
           #:read-corpus-file

           #:token
           #:token-word
           #:token-beg
           #:token-end
           #:token-tag

           #:+brown-corpus+
           #:+nps-chat-corpus+
           ))

(cl:defpackage #:nlp.test-util
  (:nicknames #:ntest)
  (:use #:common-lisp #:rutil #:nlp.util #:nlp.corpora)
  (:export ))

(cl:defpackage #:nlp.core
  (:nicknames #:ncore)
  (:use #:common-lisp #:rutil #:nlp.util)
  (:export #:ngrams
           #:ngrams-eq
           #:ngrams-order
           #:ngrams-count
           #:ngrams-total-freq
           #:ngrams-max-freq
           #:ngrams-min-freq
           #:ngrams-pairs
           #:vocab
           #:freq
           #:prob
           #:logprob
           #:cond-prob
           #:cond-logprob
           #:freqs
           #:probs
           #:logprobs
           #:cond-probs
           #:cond-logprobs
           #:top-ngram
           #:hapaxes
           #:table-ngrams
           #:ngrams-table

           #:language-model
           #:lm-order
           #:lm-ngrams
           #:make-lm
           #:perplexity
           #:stupid-backoff-lm
           #:lm-backoff

           #:index-ngrams
           #:index-context-freqs
           #:index-prefix-transition-freqs
           #:index-word-transition-freqs
           #:normalize-freqs

           #:tokenize
           ;; #:stream-tokenize
           #:tokenizer
           #:regex-word-tokenizer
           #:baseline-sentence-tokenizer
           #:<word-tokenizer>
           #:<word-chunker>
           #:<sentence-tokenizer>

           #:doublenewline-paragraph-splitter
           #:<paragraph-splitter>

           #:find-collocations
           ))

;; (cl:defpackage #:nlp.phonetics
;;   (:nicknames #:npho)
;;   (:use #:common-lisp #:rutil)
;;   (:export #:phonetic-transform
;;            ))

;; (cl:defpackage #:nlp.syntax
;;   (:nicknames #:nsyn)
;;   (:use #:common-lisp #:rutil)
;;   (:export #:pos-tag
;;            #:parse
;;            #:parse-deps
;;            ))

(cl:defpackage #:nlp.generation
  (:nicknames #:ngen)
  (:use #:common-lisp #:rutil #:nlp.util #:nlp.core)
  (:export #:generate-text

           #:text-generator

           #:markov-chain-generator
           #:mark-v-shaney-generator
           #:<mark-v-shaney>
           #:markov-order
           ))

;; (cl:defpackage #:nlp.learning
;;   (:nicknames #:nlearn)
;;   (:use #:common-lisp #:rutil)
;;   (:export #:cluster
;;            #:classify
;;            #:train
;;            ))


(cl:defpackage #:nlp-user
  (:nicknames #:nlp)
  (:use #:common-lisp #:rutil
        #:nlp.util #:nlp.corpora #:nlp.core #:nlp.generation))

(re-export-symbols '#:nutil '#:nlp-user)
(re-export-symbols '#:ncore '#:nlp-user)
(re-export-symbols '#:ncorp '#:nlp-user)
(re-export-symbols '#:ngen '#:nlp-user)

--------------------------------------------------------------------------------
/src/generation/markov-chain.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.generation)
(named-readtables:in-readtable rutils-readtable)


(defgeneric generate-text (generator data length &key)
  (:documentation
   "Generate random text of LENGTH words based on some DATA
    (usually, a table of transition probabilities between tokens).
    May not return a period at the end."))


(defclass text-generator ()
  ()
  (:documentation
   "Base class for text generators."))

(defclass markov-chain-generator (text-generator)
  ((order :accessor markov-order :initarg :order))
  (:documentation
   "Markov chain generator of the given ORDER."))

(defclass mark-v-shaney-generator (markov-chain-generator)
  ((order :reader markov-order :initform 2))
  (:documentation
   "Markov chain generator of the 2nd order. It is defined separately, because:
    - this is the general and most useful case
    - it allows using optimized data-structures and a simpler algorithm
    - the name is well-known"))

(defmethod generate-text ((generator markov-chain-generator)
                          (transitions hash-table)
                          length &key skip-paragraphs &allow-other-keys)
  "Generate text of LENGTH words with a markov model of MARKOV-ORDER described
   by the table TRANSITIONS of transition probabilities between reversed
   prefixes of MARKOV-ORDER length and words.
   Unless SKIP-PARAGRAPHS is set, the text may include newlines."
  (let* ((order (markov-order generator))
         (initial-prefix (if (> order 1)
                             (cons "¶" (make-list (1- order)))
                             (list "¶")))
         (prefix initial-prefix)
         rez)
    (loop :for i :from 1 :to length :do
       (let ((r (random 1.0)))
         (dotable (word prob
                   (or (get# prefix transitions)
                       ;; no continuation - start anew
                       (prog1 (get# (setf prefix initial-prefix) transitions)
                         ;; add a period unless one is already there
                         (unless (every #'period-char-p (car rez))
                           (push "." rez)
                           (incf i)))))
           (when (<= (decf r prob) 0)
             (if (string= "¶" word)
                 (if skip-paragraphs
                     (decf i)  ; don't count the newline if we don't include it
                     (push +newline+ rez))
                 (push word rez))
             (setf prefix (cons word (butlast prefix)))
             (return)))))
    (reverse rez)))

(defmethod generate-text ((generator markov-chain-generator)
                          (model language-model)
                          length &key &allow-other-keys)
  "Generate text of LENGTH words with a markov model of MARKOV-ORDER
   with the given language MODEL.
   May not return a period at the end."
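  ;; A sketch of what happens below: at each step we walk the vocabulary,
  ;; summing the conditional probabilities of each word continuing the
  ;; current NGRAM, then draw a uniform random number in [0, total) and
  ;; binary-search for the word whose cumulative-probability interval it
  ;; falls into (roulette-wheel sampling); drawing "</S>" ends the sentence
  ;; with a period and restarts generation from the "<S>" marker.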
  (assert (<= (markov-order generator) (lm-order model)))
  (let* ((order (markov-order generator))
         (vocab (vocab model))
         (len (length vocab))
         (ngram (list "<S>"))
         rez)
    (loop :for i :from 1 :to length :do
       (when (= (length ngram) order)
         (setf ngram (rest ngram)))
       (let ((total 0)
             (cond-probs (list (cons "</S>" 0))))
         (dolist (word vocab)
           (unless (string= "<S>" word)
             (push (cons word
                         (incf total (cond-prob model (append ngram
                                                              (list word)))))
                   cond-probs)))
         (let ((word (car (bin-search (random total)
                                      (make-array len
                                                  :initial-contents cond-probs)
                                      #'> :key #'cdr))))
           (if (string= "</S>" word)
               (progn (if (every #'period-char-p (car rez))
                          (unless (= i length)
                            (decf i))  ; just skip
                          (push "." rez))
                      (setf ngram (list "<S>")))
               (setf rez (cons word rez)
                     ngram (append ngram (list word)))))))
    (reverse rez)))


(define-lazy-singleton mark-v-shaney (make 'markov-chain-generator :order 2)
  "The infamous Mark V. Shaney.")

--------------------------------------------------------------------------------
/src/util.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.util)
(named-readtables:in-readtable rutils-readtable)


;;; Some package renaming

(rename-package "CL-PPCRE" "CL-PPCRE" '("PPCRE" "RE"))


;;; Conditions

(define-condition nlp-error (simple-error) ())

(define-condition not-implemented-error (simple-error) ())


;;; Working with chars

(defparameter +newline+
  (fmt "~%")
  "A string with a single newline.")

(defparameter +white-chars+
  '(#\Space #\Tab #\Newline #\Return #\Linefeed)
  "Chars considered WHITESPACE.")

(defun white-char-p (char)
  "Test if CHAR is in +WHITE-CHARS+."
  (member char +white-chars+))

(defparameter +period-chars+
  '(#\. #\? #\!)
  "Chars considered legitimate sentence endings.")

(defun period-char-p (char)
  "Test if CHAR is in +PERIOD-CHARS+."
  (member char +period-chars+))

(defparameter +newline-chars+
  '(#\Newline #\Return #\Linefeed)
  "Chars considered legitimate paragraph endings.")

(defun newline-char-p (char)
  "Test if CHAR is in +NEWLINE-CHARS+."
  (member char +newline-chars+))

(declaim (inline filler))
(defun filler (n &optional (fill-char #\Space))
  "Produce an N-element filler string of FILL-CHAR's."
  (make-string n :initial-element fill-char))


;;; Working with words

(defparameter *stopwords-en*
  '("!" "\"" "'" "," "-" "." ":" ";" "<S>" "</S>" "?"
"a" "about" "above" "after" "again" "against" "all" "am" "an" "and" "any" "are" "aren't" "as" "at" "be" "because" "been" "before" "being" "below" "between" "both" "but" "by" "can't" "cannot" "could" "couldn't" "d" "did" "didn't" "do" "does" "doesn't" "doing" "don't" "down" "during" "each" "few" "for" "from" "further" "had" "hadn't" "has" "hasn't" "have" "haven't" "having" "he" "he'd" "he'll" "he's" "her" "here" "here's" "hers" "herself" "him" "himself" "his" "how" "how's" "i" "i'd" "i'll" "i'm" "i've" "if" "in" "into" "is" "isn't" "it" "it's" "its" "itself" "let" "let's" "ll" "me" "more" "most" "mustn't" "my" "myself" "n't" "no" "nor" "not" "of" "off" "on" "once" "only" "or" "other" "ought" "our" "ours " "ourselves" "out" "over" "own" "s" "same" "shan't" "she" "she'd" "she'll" "she's" "should" "shouldn't" "so" "some" "such" "t" "than" "that" "that's" "the" "their" "theirs" "them" "themselves" "then" "there" "there's" "these" "they" "they'd" "they'll" "they're" "they've" "this" "those" "through" "to" "too" "under" "until" "up" "very" "was" "wasn't" "we" "we'd" "we'll" "we're" "we've" "were" "weren't" "what" "what's" "when" "when's" "where" "where's" "which" "while" "who" "who's" "whom" "why" "why's" "with" "won't" "would" "wouldn't" "you" "you'd" "you'll" "you're" "you've" "your" "yours" "yourself" "yourselves") 59 | "List of english stopwords.") 60 | 61 | (defun ending-word-p (word) 62 | "Check if string WORD is some kind of a period char or a paragraph mark." 63 | (or (every #'period-char-p word) 64 | (string= "¶" word))) 65 | 66 | 67 | ;;; Working with project files 68 | 69 | (eval-always 70 | (defparameter +project-root+ (asdf:system-relative-pathname 'cl-nlp "") 71 | "Base dir of cl-nlp project.")) 72 | 73 | (defun data-file (filename) 74 | "File in data/ subdir of cl-nlp." 75 | (merge-pathnames (strcat "data/" filename) 76 | +project-root+)) 77 | 78 | (defun list-from-file (file) 79 | "Load the contents of FILE into a list of strings for each trimmed line." 80 | (let (rez) 81 | (dolines (line file) 82 | (push (string-trim +white-chars+ line) rez)) 83 | (reverse rez))) 84 | 85 | 86 | ;;; Search 87 | 88 | (defun bin-search (val vec test-less &key (start 0) end key test) 89 | "Binary search for VAL in sorted vector VEC (the order property isn't checked). 90 | Needs to specify TEST-LESS predicate. Handles START, END, KEY as usual. 91 | It TEST is provided tests the value at the found position against VAL, 92 | and returns nil if it returns nil." 93 | (let ((low start) 94 | (high (or end (1- (length vec))))) 95 | (do () 96 | ((= low high) (when (or (null test) 97 | (funcall test val (svref vec high))) 98 | (elt vec high))) 99 | (let ((mid (floor (+ low high) 2))) 100 | (if (funcall test-less (if key 101 | (funcall key (svref vec mid)) 102 | (svref vec mid)) 103 | val) 104 | (setf low (1+ mid)) 105 | (setf high mid)))))) 106 | 107 | 108 | ;;; Misc 109 | 110 | (defmacro define-lazy-singleton (name init &optional docstring) 111 | "Define a function NAME, that will return a singleton object, 112 | initialized lazily with INIT on first call. 113 | Also define a symbol macro that will expand to (NAME)." 114 | (with-gensyms (singleton) 115 | `(let (,singleton) 116 | (defun ,name () 117 | ,docstring 118 | (or ,singleton 119 | (setf ,singleton ,init))) 120 | (define-symbol-macro ,(mksym name :format "<~A>") (,name))))) 121 | 122 | (defun sorted-ht-keys (test ht) 123 | "Return hash-table keys of HT in sorted order accroding to TEST." 
  (sort (ht-keys ht) test :key #`(get# % ht)))

(defun shorter? (list n)
  "Test if LIST has fewer than N elements."
  (let ((tail list))
    (loop :repeat (1- n) :do (setf tail (cdr tail)))
    (null tail)))

--------------------------------------------------------------------------------
/src/core/language-models.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.core)
(named-readtables:in-readtable rutils-readtable)


(defclass language-model ()
  ((order :initarg :order :reader lm-order)
   (ngrams :initarg :ngrams :reader lm-ngrams))
  (:documentation
   "A language model is a collection of NGRAMS of all orders from 1 up to ORDER."))

(defmethod vocab ((model language-model) &key order-by)
  (vocab (get-ngrams 1 model) :order-by order-by))

(defgeneric make-lm (class &key 1g 2g 3g 4g 5g &allow-other-keys)
  (:documentation
   "Make an instance of a language model of a certain CLASS
    with the provided unigrams (1G), ... up to fivegrams (5G)."))

(defmethod make-lm (class &key 1g 2g 3g 4g 5g &allow-other-keys)
  (let ((order (cond (5g 5)
                     (4g 4)
                     (3g 3)
                     (2g 2)
                     (1g 1)
                     (t (error "No ngrams supplied")))))
    (make class
          :order order
          :ngrams
          (make-array
           (1+ order)
           :initial-contents
           (case order
             (1 (list nil
                      1g))
             (2 (list nil
                      (or 1g (error "No unigrams supplied for LM of order 2"))
                      2g))
             (3 (list nil
                      (or 1g (error "No unigrams supplied for LM of order 3"))
                      (or 2g (error "No bigrams supplied for LM of order 3"))
                      3g))
             (4 (list nil
                      (or 1g (error "No unigrams supplied for LM of order 4"))
                      (or 2g (error "No bigrams supplied for LM of order 4"))
                      (or 3g (error "No trigrams supplied for LM of order 4"))
                      4g))
             (5 (list nil
                      (or 1g (error "No unigrams supplied for LM of order 5"))
                      (or 2g (error "No bigrams supplied for LM of order 5"))
                      (or 3g (error "No trigrams supplied for LM of order 5"))
                      (or 4g (error "No fourgrams supplied for LM of order 5"))
                      5g)))))))

(defgeneric perplexity (model test-sentences)
  (:documentation
   "Calculate perplexity of the MODEL on the list of TEST-SENTENCES."))

(defmethod perplexity ((model language-model) test-sentences)
  (expt 2 (- (/ (reduce #'+ (mapcar #`(logprob model %) test-sentences))
                (reduce #'+ (mapcar #'length test-sentences))))))


;;; Stupid Backoff LM

(defclass stupid-backoff-lm (language-model)
  ((backoff :initarg :backoff :initform 0.4 :reader lm-backoff))
  (:documentation
   "Stupid Backoff language model."))

(defmethod prob ((lm language-model) (sentence string))
  (prob lm (tokenize <word-tokenizer> sentence)))

(defmethod prob ((lm language-model) (sentence list))
  (expt 2 (logprob lm sentence)))

(defmethod logprob ((lm language-model) (sentence string))
  (logprob lm (tokenize <word-tokenizer> sentence)))

(defmethod logprob ((model language-model) (sentence list))
  (unless sentence (return-from logprob nil))
  (with-slots (order) model
    (let ((rez 0))
      (if (= 1 order)
          (dolist (word sentence rez)
            (incf rez (logprob (get-ngrams 1 model) word)))
          (let ((s (append (cons "<S>" sentence) (list "</S>"))))
            (if (shorter? s order)
                (logprob (get-ngrams (length s) model) s)
                (progn
                  (do ((i 2 (1+ i)))
                      ((= i order))
                    (incf rez (cond-logprob model (sub s 0 i))))
                  (do ((tail s (rest tail)))
                      ((shorter? tail order))
                    (let ((ngram (sub tail 0 order)))
                      (unless (search '("</S>" "<S>") ngram :test 'equal)
                        (incf rez (cond-logprob model ngram)))))
                  rez)))))))

(defmethod cond-prob ((model stupid-backoff-lm) ngram)
  (with-accessors ((ngrams lm-ngrams) (backoff lm-backoff)) model
    (let ((coef 1)
          (len (length ngram)))
      (loop :for i :from len :downto 1 :do
         (let* ((cur (butlast ngram (- len i)))
                (freq (freq (elt ngrams i)
                            (if (cdr cur) cur (car cur)))))
           (if (zerop freq)
               (setf coef (* coef backoff))
               (return-from cond-prob
                 (* coef (/ freq
                            (case i
                              (1 (ngrams-total-freq (elt ngrams 1)))
                              (2 (freq (elt ngrams 1) (car ngram)))
                              (otherwise
                               (freq (elt ngrams (1- i)) (butlast ngram))))))))))
      (* coef (/ (ngrams-min-freq (elt ngrams 1))
                 (ngrams-total-freq (elt ngrams 1)))))))

(defmethod cond-logprob ((model stupid-backoff-lm) ngram)
  (log2 (cond-prob model ngram)))


;;; Helper functions

(declaim (inline get-ngrams))
(defun get-ngrams (order model)
  "Get ngrams of a given ORDER from MODEL."
  (assert (<= order (lm-order model)))
  (elt (lm-ngrams model) order))

--------------------------------------------------------------------------------
/src/core/indexing.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.core)
(named-readtables:in-readtable rutils-readtable)
(declaim (optimize (compilation-speed 2) (speed 3) (space 2) (debug 1)))


(defun index-ngrams (order words &key ignore-case)
  "Make an ngrams-table of ORDER from a list of WORDS.
   May IGNORE-CASE."
  (make 'table-ngrams :order order
        :table
        (let ((ht (make-hash-table :test (if ignore-case 'equalp 'equal)
                                   :rehash-size 10.0))
              (last-idx (1- order)))
          (do ((tail words (rest tail)))
              ((shorter? tail order))
            (let ((cur (car tail)))
              (if (= 1 order)
                  (progn
                    (when (stringp cur)
                      (cond ((string= "¶" cur)
                             (setf cur "<S>"))
                            ((and (every #'period-char-p cur)
                                  (or (null (rest tail))
                                      (upper-case-p (char (second tail) 0))))
                             (setf tail (cons nil (cons "<S>" (rest tail)))))))
                    (incf (get# cur ht 0)))
                  (incf (get# (let* ((ngram (sub tail 0 order))
                                     (suffix (rest ngram)))
                                (cond
                                  ;; paragraph start is sentence start
                                  ((string= "¶" cur)
                                   (cons "<S>" suffix))
                                  ;; paragraph end
                                  ((string= "¶" (nth last-idx ngram))
                                   (setf tail (nthcdr (1- last-idx) tail))
                                   (append (butlast ngram) (list "</S>")))
                                  ;; sentence end
                                  ((and (upper-case-p
                                         (char (nth last-idx ngram) 0))
                                        (every #'period-char-p
                                               (nth (1- last-idx) ngram)))
                                   (setf tail (append (list nil "<S>")
                                                      (nthcdr last-idx tail)))
                                   (append (butlast ngram) (list "</S>")))
                                  ;; inside sentence
                                  (t ngram)))
                              ht 0)))))
          (when (= order 1)
            (when (get# "<S>" ht)
              (set# "<S>" ht (decf (get# "<S>" ht)))))
          ht)))

(defun index-context-freqs (words &key ignore-order)
  "Create a table of weighted conditional frequencies
   of 1-word contexts to each side of a word
   (if IGNORE-ORDER, both left_right and right_left
   are normalized and treated as the same context)
   for each distinct word in WORDS."
  ;; TODO: generalize for broader contexts
  (let ((ctxs (make-hash-table :test 'equal)))
    (loop :for (prev cur next) :on (cons "<S>" (append words (list "</S>")))
          :while next :do
       (unless (get# cur ctxs)
         (set# cur ctxs (make-hash-table :test 'equal)))
       (when (and (upper-case-p (char cur 0))
                  (ending-word-p prev))
         (setf prev "<S>"))
       (when (and (upper-case-p (char next 0))
                  (ending-word-p cur))
         (setf next "</S>"))
       (let ((prev_next (if (and ignore-order (string< next prev))
                            (strcat next "_" prev)
                            (strcat prev "_" next))))
         (set# prev_next (get# cur ctxs)
               (1+ (get# prev_next (get# cur ctxs) 0)))))
    (normalize-freqs ctxs)))

(defun index-prefix-transition-freqs (words &key (n 1))
  "Create a table of weighted conditional frequencies
   of next words for each distinct reversed N-word sequence in WORDS."
  (let ((transitions (make-hash-table :test 'equalp))
        (limit (length words))
        (count 0))
    ;; traversing the list of words from the end
    (loop :for tail :on (reverse (append (make-list n) words))
          :while (< count limit) :do
       (incf count)
       (let* ((word (car tail))
              (prefix (sub tail 1 (1+ n))))
         (when (and (> n 1) (string= "¶" (car prefix)))
           (setf prefix (cons "¶" (make-list (1- n)))))
         (unless (get# prefix transitions)
           (set# prefix transitions (make-hash-table :test 'equal)))
         (set# word (get# prefix transitions)
               (1+ (get# word (get# prefix transitions) 0)))))
    (normalize-freqs transitions)))

(defun index-word-transition-freqs (words)
  "Create a table of weighted conditional frequencies
   of next words for each distinct word in WORDS."
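  ;; The resulting table maps each word to a nested table of successor
  ;; frequencies normalized into probabilities, e.g. (made-up numbers):
  ;;   "the" -> {"cat" 2/5, "dog" 3/5}
  ;; with "¶" prepended as the transition source for the first word.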
  (let ((transitions (make-hash-table :test 'equalp))
        (word-vec (make-array (1+ (length words))
                              :initial-contents (cons "¶" words))))
    (dotimes (i (length words))
      (let ((prev (elt word-vec i))
            (cur (elt word-vec (+ i 1))))
        (unless (get# prev transitions)
          (set# prev transitions (make-hash-table :test 'equal)))
        (set# cur (get# prev transitions)
              (1+ (get# cur (get# prev transitions) 0)))))
    (normalize-freqs transitions)))


;;; Collocations

(defun find-collocations (bigrams &key (n 20))
  "Find up to N strongest collocations in BIGRAMS."
  (let ((rez (make-hash-table :test 'equal))
        (left (make-hash-table :test 'equal))
        (right (make-hash-table :test 'equal))
        (total (ngrams-total-freq bigrams)))
    (dotable (ngram freq (ngrams-table bigrams))
      (ds-bind (l r) ngram
        (set# l left (+ freq (get# l left 0)))
        (set# r right (+ freq (get# r right 0)))))
    (dotable (ngram freq (ngrams-table bigrams))
      (unless (reduce #'or2
                      (mapcar #`(member % *stopwords-en* :test 'string-equal)
                              ngram))
        (let ((lfreq (- (get# (car ngram) left) freq))
              (rfreq (- (get# (cadr ngram) right) freq)))
          (set# ngram rez
                (log-likelihood-ratio freq lfreq
                                      rfreq (- total lfreq rfreq freq))))))
    (take n (sorted-ht-keys '> rez))))


;;; Helpers

(defun normalize-freqs (ht-of-hts)
  "For each table in HT-OF-HTS normalize all the values.
   Returns the modified HT-OF-HTS."
  (maphash #`(let ((total (reduce '+ (ht-vals %%))))
               (dotable (k v %%)
                 (set# k %% (/ v total))))
           ht-of-hts)
  ht-of-hts)

--------------------------------------------------------------------------------
/nltk/ch1-1.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nltk)
(named-readtables:in-readtable rutils-readtable)


;;; Texts we'll be working with

(defclass text ()
  ((name :initarg :name)
   (raw :initarg :raw :accessor text-raw)
   (words :accessor text-words)
   (ctxs :accessor text-ctxs)
   (transitions :accessor text-transitions)
   (dispersion :accessor text-dispersion)))

(defmethod print-object ((text text) stream)
  (with-slots (name raw) text
    (let ((len (length raw)))
      (format stream "#<~A ~A... ~A>"
              name (subseq raw 0 (min len 20)) len))))
~A>" 21 | name (subseq raw 0 (min len 20)) len)))) 22 | 23 | (defmethod slot-unbound (class (obj text) (slot (eql 'words))) 24 | (with-slots (raw words) obj 25 | (format t "~&Tokenizing text...~%") 26 | (prog1 (setf words (mapcan #`(cons "¶" (tokenize %)) 27 | (tokenize raw))) 28 | (format t "Number of words: ~A~%" (length words))))) 29 | 30 | (defmethod slot-unbound (class (obj text) (slot (eql 'ctxs))) 31 | (with-slots (words ctxs) obj 32 | (format t "~&Building word contexts...~%") 33 | (prog1 (setf ctxs (index-context-freqs words)) 34 | (format t "Number of unique words: ~A~%" (length (ht-keys ctxs)))))) 35 | 36 | (defmethod slot-unbound (class (obj text) (slot (eql 'transitions))) 37 | (rebuild-transitions obj 2)) 38 | 39 | (defmethod slot-unbound (class (obj text) (slot (eql 'dispersion))) 40 | (with-slots (words dispersion) obj 41 | (format t "~&Building dispersion table...~%") 42 | (let ((ht (make-hash-table :test 'equal))) 43 | (doindex (idx word words) 44 | (set# word ht (cons idx (get# word ht)))) 45 | (prog1 (setf dispersion ht) 46 | (format t "Number of words: ~A~%" (length (ht-keys dispersion))))))) 47 | 48 | (defun nltk-text-file (name) 49 | (merge-pathnames (fmt "nltk/data/~(~A~).txt" name) 50 | nutil:+project-root+)) 51 | 52 | (defvar *texts* (make-hash-table)) 53 | 54 | (defun load-nltk-texts (&optional (dir "data/")) 55 | "Load and cache texts, that we'll use in the examples." 56 | (dolist (text '(:moby :sense :genesis :inaugural :nps-chat)) 57 | (set# text *texts* 58 | (make 'text :name text 59 | :raw (string-trim +white-chars+ 60 | (read-file (nltk-text-file text)))))) 61 | (maphash #`(print %%) *texts*)) 62 | 63 | 64 | ;;; Main functions 65 | 66 | (defun concordance (text word &key (width 25) pass-newlines) 67 | "Print contexts (up to WIDTH chars) of WORD usage in TEXT. 68 | If PASS-NEWLINES isn't set, the context will be shown 69 | up to the closest newline." 70 | (print-word-in-contexts word (text-raw text) 71 | :width 30 :pass-newlines pass-newlines)) 72 | 73 | (defun similar (text word &key (n 20)) 74 | "Find N most similar words to WORD in TEXT." 75 | (let* ((ctxs (text-ctxs text)) 76 | (ctx (get# word ctxs)) 77 | (rez (make-hash-table :test 'equal))) 78 | (maphash #`(let ((common (match-ctxs ctx %%))) 79 | (unless (or (string= word %) 80 | (zerop (hash-table-count common))) 81 | (set# % rez (reduce #'+ (ht-vals common))))) 82 | ctxs) 83 | (take n (sorted-ht-keys '> rez)))) 84 | 85 | (defun common-contexts (text &rest words) 86 | "Find matching contexts between WORDS in TEXT." 87 | (with-slots (ctxs) text 88 | (when-it (reduce #`(match-ctxs (get# %% ctxs) %) 89 | (rest words) 90 | :initial-value (get# (car words) ctxs)) 91 | (sorted-ht-keys '> it)))) 92 | 93 | (defun generate (text &key (n 20) (order 2)) 94 | "Generate random text of N words, based on TEXT." 
  (with-slots (transitions) text
    (string-trim (append +white-chars+ +newline-chars+)
                 (fmt "~{~A ~}"
                      (generate-text (make 'markov-chain-generator :order order)
                                     (if (= order
                                            (length (car (ht-keys transitions))))
                                         transitions
                                         (rebuild-transitions text order))
                                     n)))))

(defun rebuild-transitions (text n)
  "Rebuild the transitions index for prefixes of length N."
  (with-slots (words transitions) text
    (format t "~&Building prefix transitions index of length ~A...~%" n)
    (prog1 (setf transitions (index-prefix-transition-freqs words :n n))
      (format t "Number of prefixes: ~A~%" (length (ht-keys transitions))))))


(defun dispersion-plot (text &rest words)
  "Plot dispersion of WORDS in TEXT."
  (plot-dispersion words (dump-data-for-plot (reverse words)
                                             (text-dispersion text))))

(defun lexical-diversity (text)
  "Calculate the lexical diversity measure of TEXT."
  (with-slots (words) text
    (/ (float (length words))
       (hash-table-count (uniq words :raw t)))))

(defun percentage (count total)
  "Return the percentage of COUNT / TOTAL."
  (/ (* 100.0 count) total))


;;; Helper functions

(defun uniq (list &key raw case-insensitive)
  "Return only unique elements from LIST either as a new list
   or as a hash-table if RAW is set. Can be CASE-INSENSITIVE."
  (let ((uniqs (make-hash-table :test (if case-insensitive 'equalp 'equal))))
    (dolist (elt list)
      (set# elt uniqs t))
    (if raw uniqs (ht-keys uniqs))))

(defun match-ctxs (word1-ctx word2-ctx)
  "Find the intersection between WORD1-CTX and WORD2-CTX tables
   and for each common context calculate the commonality weight."
  (let ((rez (make-hash-table :test 'equal)))
    (when (and word1-ctx word2-ctx)
      (dolist (k (intersection (ht-keys word1-ctx) (ht-keys word2-ctx)
                               :test 'string=))
        (set# k rez (* (get# k word1-ctx)
                       (get# k word2-ctx)))))
    rez))

(defun dump-data-for-plot (words dispersion-table)
  "Dump data from DISPERSION-TABLE for WORDS into a temporary file
   and return its name."
  (let ((filename (fmt "/tmp/~A" (gensym))))
    (with-out-file (out filename)
      (format out "0~t0~t~%")
      (doindex (i word words)
        (dolist (idx (get# word dispersion-table))
          (format out "~A~t~A~t~A~%" idx (1+ i) word)))
      (format out "0~t~A~t~%" (1+ (length words))))
    filename))

(defun plot-dispersion (words file)
  "Plot WORDS dispersion data from FILE."
  (cgn:start-gnuplot)
  (cgn:format-gnuplot "set title \"Lexical Dispersion Plot\"")
  (cgn:format-gnuplot "plot \"~A\" using 1:2:yticlabels(3) title \"\"" file))


;;; Brown corpus information

#+manually (
(format t " Genre               | Tokens   | Types   | Lexical Diversity")
(format t "~%--------------------+----------+---------+--------------------")
(maphash #`(let* ((tokens (corpus-text-tokens %%))
                  (words (mapcar #'ncorp::token-word (flatten tokens)))
                  (words-count (length words))
                  (types-count (hash-table-count (uniq words :raw t))))
             (format t "~&~20A| ~7A | ~6A | ~6,1F~%"
                     (subseq (symbol-name %)
                             0 (min 19 (length (symbol-name %))))
                     words-count
                     types-count
                     (/ (* 1.0 words-count) types-count)))
         +brown-corpus+)
)

--------------------------------------------------------------------------------
/src/core/tokenization.lisp:
--------------------------------------------------------------------------------
;;; (c) 2013 Vsevolod Dyomkin

(in-package #:nlp.core)
(named-readtables:in-readtable rutils-readtable)


(defgeneric tokenize (tokenizer string)
  (:documentation
   "Tokenize STRING with TOKENIZER. Outputs 2 values:

    - list of words
    - list of spans as beg-end cons pairs
   "))

;; (defgeneric stream-tokenize (tokenizer input output &optional span-output)
;;   (:documentation
;;    "Tokenize INPUT with TOKENIZER and write to OUTPUT one word per line.
;;     Also can write to SPAN-OUTPUT beg-end cons pairs related to words.
;;
;;     Usage example:
;;
;;     (let ((pipe (make-two-way-stream
;;                  (make-input-string-stream) (make-output-string-stream))))
;;       (bt:make-thread #`(stream-tokenize *standard-input* pipe))
;;       (loop :for line := (read-line pipe nil) :while line :do
;;          (print line)))
;;   "))


(defclass tokenizer ()
  ()
  (:documentation
   "Base class for tokenizers."))

(defmethod tokenize :around ((tokenizer tokenizer) string)
  "Pre-split text into lines and tokenize each line separately."
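  ;; Each line is handed to the next most specific TOKENIZE method; the
  ;; returned spans are shifted by the running OFFSET so that they index
  ;; into the original multi-line STRING (the 1+ below accounts for the
  ;; newline character eaten by the split).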
  (let ((offset 0)
        words spans)
    (loop :for line :in (split-sequence #\Newline string) :do
       (mv-bind (ts ss) (call-next-method tokenizer line)
         (setf words (nconc words ts)
               spans (nconc spans (mapcar #`(cons (+ (car %) offset)
                                                  (+ (cdr %) offset))
                                          ss)))
         (incf offset (1+ (length line)))))
    (values words
            spans)))


;;; Word tokenization

(defclass regex-word-tokenizer (tokenizer)
  ((regex :accessor tokenizer-regex :initarg :regex
          :initform
          (re:create-scanner
           "(\\w+|[!\"#$%&'*+,./:;<=>?@^`~…\\(\\)⟨⟩{}\\[\\|\\]‒–—―«»“”‘’¶-])")
          :documentation
          "A simpler variant would be [^\\s]+
           — it doesn't split punctuation, yet sometimes it's desirable."))
  (:documentation
   "Regex-based word tokenizer."))

(defmethod tokenize ((tokenizer regex-word-tokenizer) string)
  (loop :for (beg end) :on (re:all-matches (tokenizer-regex tokenizer) string)
        :by #'cddr
        :collect (subseq string beg end) :into words
        :collect (cons beg end) :into spans
        :finally (return (values words
                                 spans))))

(define-lazy-singleton word-tokenizer (make 'regex-word-tokenizer)
  "Basic word tokenizer.")

(define-lazy-singleton word-chunker
    (make 'regex-word-tokenizer :regex (re:create-scanner "[^\\s]+"))
  "Dumb word tokenizer, that will not split punctuation from words.")


(defclass treebank-word-tokenizer (tokenizer)
  ()
  (:documentation
   "The Treebank tokenizer uses regular expressions to tokenize text
    as in Penn Treebank. It's a port of Robert MacIntyre's tokenizer
    (see: <http://www.cis.upenn.edu/~treebank/tokenizer.sed>).
    It assumes that the text has already been split into sentences.

    This tokenizer performs the following steps:

    - split standard contractions: don't -> do n't, they'll -> they 'll
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line
   "))

;; (let ((contractions-regex
;;        (re:create-scanner (s+ "("
;;                               "([^' ])('[sS]|'[mM]|'[dD]|') "
;;                               "|"
;;                               "([^' ])('ll|'re|'ve|n't|) "
;;                               "|"
;;                               "\\b("
;;                               "(can)(not)"
;;                               "|(d)('ye)"
;;                               "|(gim)(me)"
;;                               "|(gon)(na)"
;;                               "|(got)(ta)"
;;                               "|(lem)(me)"
;;                               "|(wan)(na)"
;;                               "|(mor)('m)"
;;                               ")\\b"
;;                               "|"
;;                               " ('t)(is|was)\\b"
;;                               ")"
;;                           :case-insensitive-mode t))
;;       (contractions3-regex (re:create-scanner
;;                             :case-insensitive-mode t)))
;;   (defmethod tokenize ((tokenizer treebank-word-tokenizer) string)
;;     (re-setf string
;;              ;; starting quotes
;;              (re:regex-replace-all "^\"" "``")
;;              (re:regex-replace-all "(``)" " \\1 ")
;;              (re:regex-replace-all "([ (\[{<])\"" "\\1 `` ")
;;              (re:regex-replace-all "^\"" "``")
;;              (re:regex-replace-all "^\"" "``")
;;              ;; punctuation
;;              (re:regex-replace-all "([:,])([^\\d])" " \\1 \\2 ")
;;              (re:regex-replace-all "\\.\\.\\." " ... ")
") 128 | ;; (re:regex-replace-all "[;@#$%&?!]" " \\0 ") 129 | ;; (re:regex-replace-all "([^\\.])(\\.)([\]\)}>\"']*)\\s*$" "\\1 \\2\\3 ") 130 | ;; (re:regex-replace-all "--" " \\0 ") 131 | ;; (re:regex-replace-all "[\\]\\[\\(\\)\\{\\}\\<\\>]" " \\0 ") 132 | ;; (strcat " " " ") 133 | ;; ;; quotes 134 | ;; (re:regex-replace-all "([^'])(') " "\\1 \\2 ") 135 | ;; (re:regex-replace-all "\"" " '' ") 136 | ;; (re:regex-replace-all "(\\S)(\\'\\')" "\\1 \\2 ") 137 | 138 | ;; (re:regex-replace-all contractions-regex "\\1 \\2 ") 139 | ;; (re:regex-replace-all " +" " ") 140 | ;; (string-trim " ")) 141 | ;; (unless (blankp string) 142 | ;; (setf text (strcat string " "))) 143 | ;; (tokenize string)) 144 | 145 | ;;; Sentence splitting 146 | 147 | (defclass baseline-sentence-tokenizer (tokenizer) 148 | () 149 | (:documentation 150 | "Basic tokenizer for sentence splitting.")) 151 | 152 | (defparameter +abbrevs-with-dot+ 153 | (list-from-file (data-file "abbrevs-with-dot.txt")) 154 | "Widely-used abbreviations ending in dot.") 155 | 156 | (defmethod tokenize ((tokenizer baseline-sentence-tokenizer) string) 157 | (mv-bind (words word-spans) 158 | (tokenize (make 'regex-word-tokenizer :regex "[^\\s]+") 159 | (substitue #\¶ #\Newline string)) 160 | (let ((beg 0) 161 | sentences spans) 162 | (loop :for ws :on words :and ss :on word-spans :do 163 | (let ((word (first ts)) 164 | (span (first ss))) 165 | (when (or (null (rest ts)) 166 | (and (member (char word (1- (length word))) 167 | '(#\. #\? #\! #\¶)) 168 | (not (member word +abbrevs-with-dot+ 169 | :test #'string-equal)) 170 | (and-it (second ts) 171 | (upper-case-p (char it 0))))) 172 | (push (subseq string beg (cdr span)) sentences) 173 | (push (cons beg (cdr span)) spans) 174 | (setf beg (car (second ss)))))) 175 | (values (reverse sentences) 176 | (reverse spans))))) 177 | 178 | (define-lazy-singleton sentence-tokenizer (make 'baseline-sentence-tokenizer) 179 | "Basic sentence splitter.") 180 | 181 | 182 | ;;; Paragraph splitting 183 | 184 | (defclass doublenewline-paragraph-splitter () 185 | () 186 | (:documentation 187 | "Paragraph tokenizer that splits text on double newlines 188 | and removes single newlines.")) 189 | 190 | (defmethod tokenize ((tokenizer doublenewline-paragraph-splitter) string) 191 | (let ((newline-regex (re:create-scanner 192 | (fmt "(~C|[~C~C]{1,2})" #\Newline #\Return #\Linefeed)))) 193 | (mapcar #`(fmt "~{~A ~}" %) 194 | (split-sequence-if #'blankp 195 | (re:split newline-regex string) 196 | :remove-empty-subseqs t)))) 197 | 198 | (define-lazy-singleton paragraph-splitter 199 | (make 'doublenewline-paragraph-splitter) 200 | "Basic paragraph splitter.") 201 | 202 | 203 | ;;; Helpers 204 | 205 | (defmacro re-setf (var &body body) 206 | "For each clause in BODY wrap it in `(setf var (clause arg1 var args))`." 
203 | ;;; Helpers
204 | 
205 | (defmacro re-setf (var &body body)
206 |   "For each clause in BODY wrap it in `(setf var (clause arg1 var args))`."
207 |   `(progn
208 |      ,@(mapcar (lambda (clause)
209 |                  `(setf ,var (,(car clause)
210 |                               ,@(when-it (cadr clause) (list it))
211 |                               ,var
212 |                               ,@(when-it (cadr clause) (nthcdr 2 clause)))))
213 |                body)))
214 | 
--------------------------------------------------------------------------------
/src/core/ngrams.lisp:
--------------------------------------------------------------------------------
1 | ;;; (c) 2013 Vsevolod Dyomkin
2 | 
3 | (in-package #:nlp.core)
4 | (named-readtables:in-readtable rutils-readtable)
5 | 
6 | 
7 | ;;; Abstract Ngrams
8 | 
9 | (defclass ngrams ()
10 |   ((order :initarg :order :reader ngrams-order)
11 |    (count :reader ngrams-count)
12 |    (max-freq :reader ngrams-max-freq)
13 |    (min-freq :reader ngrams-min-freq)
14 |    (total-freq :reader ngrams-total-freq))
15 |   (:documentation
16 |    "An abstract ngrams interface."))
17 | 
18 | (defmethod print-object ((ngrams ngrams) stream)
19 |   (print-unreadable-object (ngrams stream :type t :identity t)
20 |     (if (slot-boundp ngrams 'order)
21 |         (with-accessors ((order ngrams-order) (count ngrams-count)
22 |                          (total-freq ngrams-total-freq)) ngrams
23 |           (format stream "order:~A count:~A outcomes:~A"
24 |                   order count total-freq))
25 |         (format stream "not initialized"))))
26 | 
27 | (defgeneric ngrams-eq (ngrams)
28 |   (:documentation
29 |    "Get the equality predicate of NGRAMS (can be EQUAL or EQUALP)."))
30 | 
31 | (defgeneric ngrams-pairs (ngrams &key order-by)
32 |   (:documentation
33 |    "Get the alist of all ngrams with their frequencies in NGRAMS,
34 |     possibly ordered by ORDER-BY predicate (e.g. < or >)."))
35 | 
36 | (defgeneric vocab (ngrams &key order-by)
37 |   (:documentation
38 |    "Get the list of all ngrams in NGRAMS,
39 |     possibly ordered by ORDER-BY predicate (e.g. < or >)."))
40 | 
41 | (defgeneric freq (ngrams ngram)
42 |   (:documentation
43 |    "Get the NGRAM frequency in NGRAMS.")
44 |   (:method :around ((ngrams ngrams) (ngram string))
45 |     (if (> (ngrams-order ngrams) 1)
46 |         (freq ngrams (tokenize-ngram ngrams ngram))
47 |         (call-next-method))))
48 | 
49 | (defgeneric prob (ngrams ngram)
50 |   (:documentation
51 |    "Get the NGRAM probability in NGRAMS.")
52 |   (:method ((ngrams ngrams) ngram)
53 |     (/ (freq ngrams ngram)
54 |        (ngrams-total-freq ngrams)))
55 |   (:method :around ((ngrams ngrams) (ngram string))
56 |     (if (> (ngrams-order ngrams) 1)
57 |         (prob ngrams (tokenize-ngram ngrams ngram))
58 |         (call-next-method))))
59 | 
60 | (defgeneric logprob (ngrams ngram)
61 |   (:documentation
62 |    "Get the log (to base 2) of NGRAM probability in NGRAMS.")
63 |   (:method ((ngrams ngrams) ngram)
64 |     (let ((prob (prob ngrams ngram)))
65 |       (if (zerop prob)
66 |           nil
67 |           (* (log prob) #.(/ 1 (log 2))))))
68 |   (:method :around ((ngrams ngrams) (ngram string))
69 |     (if (> (ngrams-order ngrams) 1)
70 |         (logprob ngrams (tokenize-ngram ngrams ngram))
71 |         (call-next-method))))
72 | 
73 | (defgeneric cond-prob (ngrams ngram)
74 |   (:documentation
75 |    "Get the NGRAM conditional probability in NGRAMS.
76 |     By conditional probability we mean the probability of occurrence
77 |     of the last word given the previous words.")
78 |   (:method :around ((ngrams ngrams) ngram)
79 |     (if (= 1 (ngrams-order ngrams))
80 |         (prob ngrams ngram)  ; for unigrams this degenerates to the plain probability
81 |         (call-next-method)))
82 |   (:method :around ((ngrams ngrams) (ngram string))
83 |     (if (> (ngrams-order ngrams) 1)
84 |         (cond-prob ngrams (tokenize-ngram ngrams ngram))
85 |         (call-next-method))))
86 | 
87 | (defgeneric cond-logprob (ngrams ngram)
88 |   (:documentation
89 |    "Get the log of NGRAM conditional probability in NGRAMS.
90 |     By conditional probability we mean the probability of occurrence
91 |     of the last word given the previous words.")
92 |   (:method :around ((ngrams ngrams) (ngram string))
93 |     (if (> (ngrams-order ngrams) 1)
94 |         (cond-logprob ngrams (tokenize-ngram ngrams ngram))
95 |         (call-next-method))))
96 | 
97 | (defgeneric freqs (ngrams &rest ngrams-list)
98 |   (:documentation
99 |    "Get the list of frequencies of ngrams from NGRAMS-LIST in NGRAMS.")
100 |   (:method (ngrams &rest ngrams-list)
101 |     (mapcar #`(freq ngrams %) ngrams-list)))
102 | 
103 | (defgeneric probs (ngrams &rest ngrams-list)
104 |   (:documentation
105 |    "Get the list of probabilities of ngrams from NGRAMS-LIST in NGRAMS.")
106 |   (:method (ngrams &rest ngrams-list)
107 |     (mapcar #`(prob ngrams %) ngrams-list)))
108 | 
109 | (defgeneric logprobs (ngrams &rest ngrams-list)
110 |   (:documentation
111 |    "Get the list of log-probabilities of ngrams from NGRAMS-LIST in NGRAMS.")
112 |   (:method (ngrams &rest ngrams-list)
113 |     (mapcar #`(logprob ngrams %) ngrams-list)))
114 | 
115 | (defgeneric cond-probs (ngrams &rest ngrams-list)
116 |   (:documentation
117 |    "Get the conditional probabilities of ngrams from NGRAMS-LIST in NGRAMS.
118 |     By conditional probability we mean the probability of occurrence
119 |     of the last word given the previous words.")
120 |   (:method ((ngrams ngrams) &rest ngrams-list)
121 |     (mapcar #`(cond-prob ngrams %) ngrams-list)))
122 | 
123 | (defgeneric cond-logprobs (ngrams &rest ngrams-list)
124 |   (:documentation
125 |    "Get the conditional log-probabilities of ngrams from NGRAMS-LIST in NGRAMS.
126 |     By conditional probability we mean the probability of occurrence
127 |     of the last word given the previous words.")
128 |   (:method ((ngrams ngrams) &rest ngrams-list)
129 |     (mapcar #`(cond-logprob ngrams %) ngrams-list)))
130 | 
131 | (defgeneric top-ngram (ngrams)
132 |   (:documentation
133 |    "Get some ngram with the highest frequency in NGRAMS."))
134 | 
135 | (defgeneric hapaxes (ngrams)
136 |   (:documentation
137 |    "Get all the ngrams with the lowest frequency in NGRAMS.
138 |     Second value is the frequency itself."))
139 | 
140 | 
141 | ;;; Table-based ngrams
142 | 
143 | (defclass table-ngrams (ngrams)
144 |   ((table :initform (make-hash-table :test 'equal) :initarg :table
145 |           :reader ngrams-table))
146 |   (:documentation
147 |    "Ngrams with hash-table source."))
148 | 
149 | (defmethod initialize-instance :after ((ngrams table-ngrams) &key)
150 |   (with-slots (table order max-freq min-freq total-freq) ngrams
151 |     (check-type table hash-table)
152 |     (assert (member (hash-table-test table) '(equal equalp)))
153 |     (with-hash-table-iterator (gen-fn table)
154 |       (when-it (nth-value 2 (gen-fn))
155 |         (setf total-freq (setf max-freq (setf min-freq it)))
156 |         (loop
157 |            (mv-bind (next? _ freq) (gen-fn)
158 |              (unless next? (return))
159 |              (incf total-freq freq)
160 |              (when (< freq min-freq)
161 |                (setf min-freq freq))
162 |              (when (> freq max-freq)
163 |                (setf max-freq freq))))))))
164 | 
165 | (defmethod ngrams-count ((ngrams table-ngrams))
166 |   (hash-table-count (ngrams-table ngrams)))
167 | 
168 | (defmethod ngrams-eq ((ngrams table-ngrams))
169 |   (hash-table-test (ngrams-table ngrams)))
170 | 
171 | (defmethod vocab ((ngrams table-ngrams) &key order-by)
172 |   (with-slots (table) ngrams
173 |     (if order-by
174 |         (mapcar #'car (sort (ngrams-pairs ngrams) order-by :key #'cdr))
175 |         (ht-keys table))))
176 | 
177 | (defmethod ngrams-pairs ((ngrams table-ngrams) &key order-by)
178 |   (with-slots (table) ngrams
179 |     (if order-by
180 |         (sort (ht->alist table) order-by :key #'cdr)
181 |         (ht->alist table))))
182 | 
183 | ;; (defmethod ngrams-pairs ((ngrams table-ngrams))
184 | ;;   (let ((total 0)
185 | ;;         (freq 0)
186 | ;;         (eq-test (ngrams-eq ngrams))
187 | ;;         (prefix (butlast ngram)))
188 | ;;     (dolist (ng (vocab ngrams))
189 | ;;       (cond ((funcall eq-test ngram ng)
190 | ;;              (incf total (setf freq (freq ng))))
191 | ;;             ((funcall eq-test prefix (butlast ng))
192 | ;;              (incf total (freq ng)))))
193 | ;;     (if (zerop total) 0
194 | ;;         (/ freq total))))
195 | 
196 | (defmethod freq ((ngrams table-ngrams) ngram)
197 |   (get# ngram (ngrams-table ngrams) 0))
198 | 
199 | (defmethod freqs ((ngrams table-ngrams) &rest ngrams-list)
200 |   (mapcar #`(freq ngrams %)
201 |           ngrams-list))
202 | 
203 | (defmethod probs ((ngrams table-ngrams) &rest ngrams-list)
204 |   (mapcar #`(prob ngrams %)
205 |           ngrams-list))
206 | 
207 | (defmethod cond-prob ((ngrams table-ngrams) ngram)
208 |   (let ((total 0)
209 |         (freq 0)
210 |         (eq-test (ngrams-eq ngrams))
211 |         (prefix (butlast ngram)))
212 |     (maphash #`(cond ((funcall eq-test ngram %)
213 |                       (incf total (setf freq %%)))
214 |                      ((funcall eq-test prefix (butlast %))
215 |                       (incf total %%)))
216 |              (ngrams-table ngrams))
217 |     (if (zerop total) 0
218 |         (/ freq total))))
219 | 
220 | (defmethod top-ngram ((ngrams table-ngrams))
221 |   (with-slots (table max-freq) ngrams
222 |     (dotable (ngram freq table)
223 |       (when (= max-freq freq)
224 |         (return ngram)))))
225 | 
226 | (defmethod hapaxes ((ngrams table-ngrams))
227 |   (let (rez)
228 |     (with-slots (table min-freq) ngrams
229 |       (dotable (ngram freq table)
230 |         (when (= min-freq freq)
231 |           (push ngram rez)))
232 |       (values rez
233 |               min-freq))))
234 | 
235 | 
236 | ;;; Helper functions
237 | 
238 | (defun tokenize-ngram (ngrams str)
239 |   "Transform string STR into a list if necessary (depending on the order of NGRAMS)."
240 |   (if (> (ngrams-order ngrams) 1)
241 |       (tokenize <word-chunker> str)
242 |       str))
243 | 
--------------------------------------------------------------------------------
/nltk/ch1-2.md:
--------------------------------------------------------------------------------
1 | # NLTK 1.3 - Computing with Language: Simple Statistics
2 | 
3 | Most of the remaining parts of the first chapter of the NLTK book serve as
4 | an introduction to Python in the context of text processing. I won't
5 | translate that to Lisp, because there are much better resources
6 | explaining how to use Lisp properly. First and foremost I'd refer
7 | anyone interested to the appropriate chapters of
8 | [Practical Common Lisp](http://gigamonkeys.com/book):
9 | 
10 | - [List Processing](http://gigamonkeys.com/book/they-called-it-lisp-for-a-reason-list-processing.html)
11 | - [Collections](http://gigamonkeys.com/book/collections.html)
12 | - [Variables](http://gigamonkeys.com/book/variables.html)
13 | - [Macros: Standard Control Constructs](http://gigamonkeys.com/book/macros-standard-control-constructs.html)
14 | 
15 | It's only worth noting that Lisp has a different notion of lists than
16 | Python. Lisp's lists are linked lists, while Python's are essentially
17 | vectors. Lisp also has vectors as a separate data-structure, and it
18 | also has multidimensional arrays (something Python mostly lacks). And
19 | the set of Lisp's list operations is somewhat different from
20 | Python's. The list is the default sequence data-structure, but you should
21 | understand its limitations and know when to switch to vectors (when
22 | you have a lot of elements and often access them at random). Also,
23 | Lisp doesn't provide Python-style syntactic sugar for slicing and
24 | dicing lists, although all the operations are there in the form of
25 | functions. The only thing which isn't easily reproducible in Lisp is
26 | assigning to a slice:
27 | 
28 |     >>> sent[1:9] = ['Second', 'Third']
29 |     >>> sent
30 |     ['First', 'Second', 'Third', 'Last']
31 | 
32 | There's `replace`, but it can't shrink a sequence:
33 | 
34 |     CL-USER> (defvar sent '(1 2 3 4 5 6 7 8 9 0))
35 |     CL-USER> (replace sent '("Second" "Third") :start1 1 :end1 9)
36 |     (1 "Second" "Third" 4 5 6 7 8 9 0)
37 | 
38 | ## Ngrams
39 | 
40 | So, the only part worth discussing here is statistics.
41 | 
42 | Let's start with a __frequency distribution__. We have already used
43 | something similar in the previous part for text generation, but it was
44 | very basic and tailored to the task. Now it's time to get into some
45 | serious language modeling and discuss a more general-purpose
46 | implementation.
47 | 
48 | Such modeling is accomplished by collecting large amounts of
49 | statistical data about words and the appearance of their sequences in
50 | texts. These sequences are called __ngrams__. In a nutshell, you can
51 | think of an ngrams distribution as a table mapping ngram sequences to
52 | numbers.
53 | 
54 |     (defclass ngrams ()
55 |       ((order :initarg :order :reader ngrams-order)
56 |        (count :reader ngrams-count)
57 |        (max-freq :reader ngrams-max-freq)
58 |        (min-freq :reader ngrams-min-freq)
59 |        (total-freq :reader ngrams-total-freq)))
60 | 
61 | The crucial parameter of this class is `order`, which defines the
62 | length of a sequence. In practice, ngrams of orders from 1 to 5 may be
63 | used.
64 | 
65 | `ngrams` is an abstract class. In Lisp you don't have to specify this
66 | property explicitly: you just don't implement some of its methods. The
67 | simplest `ngrams` implementation — `table-ngrams` — uses an in-memory
68 | hash-table as a store. You can get ngram frequency and "probability"
69 | (the maximum likelihood estimation) from it, as well as the log of
70 | probability, which is used more often in calculations because it
71 | avoids the problem of floating-point rounding errors that occur
72 | when multiplying probabilities, which are rather small
73 | values.
74 | 
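As a quick arithmetic illustration of the latter point: with base-2 logs,
a product of probabilities becomes a sum, so even a long chain of tiny
factors stays within a comfortable numeric range:

    ;; log2(p1 * p2) = log2(p1) + log2(p2) ; same value computed two ways
    (log (* 0.001 0.002) 2)          ; => ~ -18.93
    (+ (log 0.001 2) (log 0.002 2))  ; => ~ -18.93

With that aside, here's `table-ngrams` in action: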
75 |     NLTK> (freq (text-bigrams *moby*) "The whale")
76 |     Indexing bigrams...
77 |     Number of bigrams: 116727
78 |     14
79 |     NLTK> (logprob (text-bigrams *moby*) "The whale")
80 |     -14.255587
81 | 
82 | So how do we get the bigrams of Moby Dick? For that we just have to count
83 | all of them in the text (this is a simplified version — some additional
84 | processing for sentence starts/ends is needed):
85 | 
86 |     (defun index-ngrams (order words &key ignore-case)
87 |       (make 'table-ngrams :order order
88 |             :table
89 |             (let ((ht (make-hash-table :test (if ignore-case 'equalp 'equal))))
90 |               (do ((tail words (rest tail)))
91 |                   ((shorter? tail order))
92 |                 (incf (get# (if (= order 1)
93 |                                 (car tail)
94 |                                 (sub tail 0 order))
95 |                             ht 0)))
96 |               ht)))
97 | 
98 | `table-ngrams` will be useful for simple experimentation and prototyping,
99 | like we do in our NLTK examples.
100 | 
101 |     NLTK> (defvar *1grams* (text-ugrams *moby*))
102 |     Indexing unigrams...
103 |     Number of unigrams: 19244
104 |     NLTK> (freq *1grams* "whale")
105 |     906
106 |     NLTK> (take 50 (vocab *1grams* :order-by '>))
107 |     ("," "the" "<S>" "</S>" "." "of" "and" "-" "a" "to" ";" "in" "\"" "that" "'"
108 |      "his" "it" "I" "!" "s" "is" "he" "with" "was" "as" "all" "for" "this" "at"
109 |      "by" "but" "not" "him" "from" "be" "on" "?" "so" "whale" "one" "you" "had"
110 |      "have" "there" "But" "or" "were" "now" "which" "me")
111 | 
112 | The strings "<S>" and "</S>" here denote special symbols for sentence
113 | start and end.
114 | 
115 | Here's a cumulative plot of them:
116 | 
117 | ![Cumulative Frequency Plot for 50 Most Frequent Words in Moby Dick](http://img.photobucket.com/albums/v473/pufpuf/ccounts_zpsbc41c690.png)
118 | 
119 | And here's just the counts graph:
120 | 
121 | ![Frequency Plot for 50 Most Frequent Words in Moby Dick](http://img.photobucket.com/albums/v473/pufpuf/counts_zpsa3d96079.png)
122 | 
123 | And, finally, here are the hapaxes:
124 | 
125 |     NLTK> (take 50 (hapaxes (text-ugrams *moby*)))
126 |     ("orphan" "retracing" "sheathed" "padlocks" "dirgelike" "Buoyed" "liberated"
127 |      "Till" "Ixion" "closing" "suction" "halfspent" "THEE" "ESCAPED" "ONLY"
128 |      "Epilogue" "thrill" "etherial" "intercept" "incommoding" "tauntingly"
129 |      "backwardly" "coincidings" "ironical" "intermixingly" "whelmings" "inanimate"
130 |      "animate" "lookouts" "infatuation" "Morgana" "Fata" "gaseous" "mediums"
131 |      "bewildering" "bowstring" "mutes" "voicelessly" "THUS" "grapple"
132 |      "unconquering" "comber" "foregone" "bullied" "uncracked" "unsurrendered"
133 |      "Diving" "flume" "dislodged" "buttress")
134 | 
135 | The next Python feature showcased here is __list comprehensions__.
136 | The idea behind them is to resemble set-theoretic notation in list
137 | definition. There's no such thing out-of-the-box in Lisp (although you
138 | can implement a variant even closer to set notation in
139 | [just 24 lines](http://lisp-univ-etc.blogspot.com/2013/01/real-list-comprehensions-in-lisp.html)),
140 | and the general approach is to favor functional-style filtering with
141 | variants of `map` and `remove-if`.
142 | 
143 |     NLTK> (sort (remove-if #`(< (length %) 15)
144 |                            (uniq (text-words *moby*)))
145 |                 'string<)
146 |     ("CIRCUMNAVIGATION" "Physiognomically" "apprehensiveness" "cannibalistically" "characteristically" "circumnavigating" "circumnavigation" "circumnavigations" "comprehensiveness" "hermaphroditical" "indiscriminately" "indispensableness" "irresistibleness" "physiognomically" "preternaturalness" "responsibilities" "simultaneousness" "subterraneousness" "supernaturalness" "superstitiousness" "uncomfortableness" "uncompromisedness" "undiscriminating" "uninterpenetratingly")
147 |     NLTK> (sort (remove-if #`(or (<= (length %) 7)
148 |                                  (<= (freq (text-ugrams *chat*) %) 7))
149 |                            (vocab (text-ugrams *chat*)))
150 |                 'string<)
151 |     ("20sUser104" <... another 130 users ...> "Question" "actually" "anything" "computer" "everyone" "football" "innocent" "listening" "remember" "seriously" "something" "talkcity_adults" "thinking" "together" "watching")
152 | 
153 | In the NLTK variant all the users are removed from the corpus with some pre-processing.
154 | 
155 | ## Language Modeling
156 | 
157 | But to be useful for real-world scenarios ngrams have to be large,
158 | really large (on the order of tens of gigabytes of data for
159 | trigrams). This means that you won't be able to simply store them in
160 | memory and will have to use some external storage: a general-purpose
161 | data-store, like a relational database, or some special-purpose
162 | software.
163 | 
164 | One such ngrams service that is available on the internet is
165 | [Microsoft Web N-gram Services](http://web-ngram.research.microsoft.com/).
166 | If you have a developer token you can query it over HTTP. The service
167 | returns only log-probabilities (and also log-conditional-probabilities)
168 | and runs really slowly, but it is capable of serving batch requests,
169 | i.e. returning probabilities for several ngrams at once. The
170 | implementation of the `ngrams` interface for such a service is provided in
171 | [contrib/ms-ngrams.lisp](https://github.com/vseloved/cl-nlp/blob/master/src/contrib/ms-ngrams.lisp).
172 | 
173 | We have already encountered conditional probabilities in the previous
174 | part. They have the following relationship with regular (so-called
175 | "joint") probabilities (for bigrams):
176 | 
177 |     p(A,B) = p(B|A) * p(A)
178 |     where p(A,B) is the joint probability and p(B|A) is the conditional one
179 | 
180 | I.e. they can be calculated from the current ngrams plus the ngrams of
181 | the preceding order. So, this operation is performed not on a single
182 | `ngrams` object, but on a pair of such objects. And they serve an
183 | important role, as we'll see below. But first we need to talk about
184 | language models.
185 | 
186 | A __language model__ is, basically, a collection of ngrams of
187 | different orders. By combining these ngrams we're able to obtain some
188 | other measures beyond a simple frequency value or probability
189 | estimate. The biggest added value of such a model is in the smoothing
190 | capabilities that it implements. The problem smoothing solves is that
191 | you'll almost never be able to have all possible ngrams in your
192 | data-store — there are just too many of them, and the language users keep
193 | adding more. But it's very nasty to get a 0 probability for some
194 | ngram. A language model allows us to find a balance between the number
195 | of ngrams we have to store and the possibility of getting meaningful
196 | probability numbers for any ngram. This is achieved with various
197 | smoothing techniques: interpolation and discounting. Some of the
198 | smoothing methods are:
199 | 
200 | - +1 smoothing
201 | - Kneser-Ney smoothing
202 | - Stupid Backoff
203 | 
204 | A good general compilation of various smoothing methods is assembled in
205 | [this presentation](http://courses.washington.edu/ling570/fei_fall09/10_26_Smoothing.pdf).
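To make the first of these concrete, here is a minimal sketch (not part
of CL-NLP) of +1, a.k.a. Laplace, smoothing on top of the `table-ngrams`
interface; `vocab-size`, the number of possible distinct ngrams, is an
assumed parameter:

    (defun laplace-prob (ngrams ngram vocab-size)
      ;; every count is incremented by 1, so an unseen ngram
      ;; (freq = 0) still gets a small non-zero probability
      (/ (1+ (freq ngrams ngram))
         (+ (ngrams-total-freq ngrams) vocab-size)))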
206 | 
207 | Let's look at the simplified implementation of scoring a sentence
208 | with the Stupid Backoff model:
209 | 
210 |     (defmethod logprob ((model language-model) (sentence list))
211 |       (with-slots (order) model
212 |         (let ((rez 0)
213 |               (s (append (cons "<S>" sentence) (list "</S>"))))
214 |           (when (shorter? s order)
215 |             (return-from logprob (logprob (get-ngrams (length s) model) s)))
216 |           ;; start of the sentence: p(A|<S>) * p(B|<S>,A) * ...
217 |           (do ((i 2 (1+ i)))
218 |               ((= i order))
219 |             (incf rez (cond-logprob model (sub s 0 i))))
220 |           ;; middle of the sentence
221 |           (do ((tail s (rest tail)))
222 |               ((shorter? tail order))
223 |             (incf rez (cond-logprob model (sub tail 0 order))))
224 |           rez)))
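The defining trick of Stupid Backoff hides inside `cond-logprob`: if the
full ngram was never seen, fall back to a shorter context with a fixed
penalty of 0.4 (Brants et al., 2007). A minimal sketch of the idea; the
list-of-ngrams representation here is for illustration only, not
CL-NLP's actual data layout:

    (defun stupid-backoff-score (ngrams-list ngram &optional (alpha 0.4))
      ;; NGRAMS-LIST holds ngrams objects from the highest order down to
      ;; unigrams; NGRAM is a list of words of the highest order's length
      (let ((ngrams (first ngrams-list)))
        (cond ((null (rest ngrams-list)) (prob ngrams (first ngram)))
              ((plusp (freq ngrams ngram)) (cond-prob ngrams ngram))
              (t (* alpha (stupid-backoff-score (rest ngrams-list)
                                                (rest ngram) alpha))))))

Note that these scores are not true probabilities (they don't sum to 1),
which is exactly the quirk we'll stumble upon in a moment.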
225 | 
226 | Eventually, the language model is able to return the estimated
227 | probability of any sequence of words, not limited to the maximum order
228 | of ngrams in it. This is usually calculated using the Markov assumption
229 | with the following formula (for a bigram language model):
230 | 
231 |     p(s) = p(A) * p(B|A) * p(C|B) * p(D|C) ... * p(Z|Y)
232 |     where s = A B ... Z
233 | 
234 |     NLTK> (defvar *moby-lm2*
235 |             (make-lm 'stupid-backoff-lm
236 |                      :1g (text-ugrams *moby*)
237 |                      :2g (text-bigrams *moby*)))
238 |     NLTK> (prob *moby-lm2* "This is a test sentence.")
239 |     6.139835e-20
240 | 
241 | That was, by the way, the probability of an unseen sentence with the
242 | word "sentence" completely missing from the vocabulary.
243 | 
244 |     NLTK> (prob *moby-lm2* '("<S>" "Moby" "Dick" "." "</S>"))
245 |     5.084481e-9
246 |     NLTK> (float (prob (text-bigrams *moby*) '("Moby" "Dick")))
247 |     3.0310333e-4
248 | 
249 | As you see, it's much more likely to encounter the sentence "Moby Dick."
250 | in this text, although not as likely as the phrase "Moby Dick". :)
251 | 
252 | Such a model is also able to generate random texts just like we did in
253 | the previous part. But because of the smoothing capability it's much
254 | more general, i.e. it can generate sequences with any word from the
255 | vocabulary, even phrases unseen before. At the same time it's much
256 | more computationally expensive, because generating each new word now
257 | takes `O(vocabulary size)` while it used to be `O(average number of words
258 | following any particular word)`.
259 | 
260 |     NLTK> (princ (generate *genesis* :order 2 :n 93))
261 |     burial to judged eaten sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung sprung foreign longed them ought up Temanites Aran earth earth blessings surface surface surface surface surface surface surface surface surface floated Darkness Now homage earth Now In princes said vengeance It passed said divide In beginning earth Asenath said re The peaceful kind Calah said blameless mistress Chaldees said hunter said middle surface surface surface surface yonder earth rib said said smoking smoking smoking
262 | 
263 | And, as you see, this example totally doesn't resemble the one in the
264 | previous part. Is this a bug? No, just a trick that is played on us
265 | because we aren't following basic mathematical principles. In the Stupid
266 | Backoff model the probabilities don't add up to 1, and the conditional
267 | probability of an unseen ngram may be larger than the largest
268 | probability of any recorded one! This is why we end up producing
269 | sequences of repeated words. The problem is much less obvious for the
270 | trigram model, although the text remains complete gibberish.
271 | 
272 |     NLTK> (princ (generate *genesis* :order 3 :n 93))
273 |     brink time wagons fourth Besides south darkness listen foreigner Stay possessor lentils backwards be call dignity Kenizzites tar witness strained Yes appear colts bodies Reuel burn inheritance Galeed Hadar money touches conceal mighty foreigner spices Set pit straw son hurry yoke numbered gutters Dedan honest drove Magdiel Nod life assembly your Massa iniquity Tola still fifteen ascending wilderness everywhere shepherd harm bore Elah Jebusites Assyria butler Euphrates sinners gave Nephilim Stay garments find lifted communing closed Ir lights doing weeping shortly disobedience possessions drank peoples fifteen bless talked songs lamb far Shaveh heavens
274 | 
275 | This example shows us at least two things:
276 | 
277 | - we should always check that the mathematical properties of our models
278 |   still hold as we tweak them
279 | - although the major use-case for a language model is scoring, you can
280 |   get a feel of how well it will perform by looking at the texts it
281 |   generates
282 | 
283 | ## Finding collocations
284 | 
285 | This is another interesting and useful NLP problem with a very elegant
286 | baseline solution, which is explained in this
287 | [article](http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html).
288 | Hopefully, we'll get back to it in more detail in future chapters.
289 | For now, here are the results of implementing the algorithm from the article:
290 | 
291 |     NLTK> (collocations *inaugural*)
292 |     (("United" "States") ("fellow" "citizens") ("four" "years") ("years" "ago")
293 |      ("Federal" "Government") ("General" "Government") ("American" "people")
294 |      ("Vice" "President") ("Old" "World") ("Almighty" "God") ("Fellow" "citizens")
295 |      ("Chief" "Magistrate") ("Chief" "Justice") ("God" "bless") ("go" "forward")
296 |      ("every" "citizen") ("Indian" "tribes") ("public" "debt") ("one" "another")
297 |      ("foreign" "nations") ("political" "parties") ("State" "governments")
298 |      ("National" "Government") ("United" "Nations") ("public" "money")
299 |      ("national" "life") ("beloved" "country") ("upon" "us") ("fellow" "Americans")
300 |      ("Western" "Hemisphere"))
301 | 
302 | I'm surprised at how similar they are to NLTK's, considering that I
303 | didn't look at their implementation. In fact, they are the same up to
304 | the difference in the list of __stopwords__ (the dirty secret of every
305 | NLP application :) The code for the collocation extraction function can be
306 | found in [core/measures.lisp](https://github.com/vseloved/cl-nlp/blob/master/src/core/measures.lisp).
307 | 
308 | ## Other uses of ngrams
309 | 
310 | Ngrams are also sometimes used for individual characters to build
311 | character language models (see the little sketch below).
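A character-bigram index over the raw text can be built with the very
same `index-ngrams` machinery by turning the string into a list of
one-character strings (a sketch; `text-raw` is assumed here to be the
accessor for the text's raw-string slot):

    (defvar *moby-char-bigrams*
      (index-ngrams 2 (map 'list #'string (text-raw *moby*))))
    (freq *moby-char-bigrams* '("t" "h"))  ; how often "t" is followed by "h"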
312 | And here's another usage from NLTK — counting word lengths.
313 | 
314 |     NLTK> (defvar *moby-lengths*
315 |             (index-ngrams 1 (mapcar #'length (text-words *moby*))))
316 |     NLTK> (vocab *moby-lengths*)
317 |     (1 4 2 6 8 9 11 5 7 3 10 12 13 14 16 15 17 18 20)
318 |     NLTK> (ngrams-pairs *moby-lengths*)
319 |     ((1 . 58368) (4 . 42273) (2 . 35726) (6 . 17111) (8 . 9966) (9 . 6428)
320 |      (11 . 1873) (5 . 26595) (7 . 14399) (3 . 49633) (10 . 3528) (12 . 1053)
321 |      (13 . 567) (14 . 177) (16 . 22) (15 . 70) (17 . 12) (18 . 1) (20 . 1))
322 |     NLTK> (ngrams-max-freq *moby-lengths*)
323 |     58368
324 |     NLTK> (freq *moby-lengths* 3)
325 |     49633
326 | 
327 | ## Final thoughts
328 | 
329 | Language modeling is really the foundation of any serious NLP
330 | work. Having access to ngrams expands your possibilities immensely,
331 | but the problem with them is that moving from a prototype to a production
332 | implementation becomes tricky due to the problems of collecting a
333 | representative data-set and then storing it efficiently.
334 | Yet, there are solutions: the [Google Books Ngrams](http://storage.googleapis.com/books/ngrams/books/datasetsv2.html)
335 | and [Google Web1T](http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html)
336 | are examples of web-scale ngram data-sets, and there's also
337 | special-purpose software for storing large ngrams corpora and
338 | obtaining language models from them. The notable examples are
339 | [BerkeleyLM](http://code.google.com/p/berkeleylm/) and
340 | [KenLM](http://kheafield.com/code/kenlm/).
341 | 
--------------------------------------------------------------------------------
/nltk/ch1-1.md:
--------------------------------------------------------------------------------
1 | # NLTK - Computing with Language: Texts and Words
2 | 
3 | OK, let's get started with the NLTK book. Its first chapter tries to
4 | impress the reader with how simple it is to accomplish some neat
5 | things with texts using it. Actually, the underlying algorithms that
6 | allow us to achieve these results are mostly quite basic. We'll discuss
7 | them in this post, and the code for the first part of the chapter can
8 | be found in [nltk/ch1-1.lisp](https://github.com/vseloved/cl-nlp/blob/master/nltk/ch1-1.lisp).
9 | 
10 | ## Setting up texts for processing
11 | 
12 | For the purpose of this demonstration we'll need several texts, which
13 | can be downloaded from [NLTK data](http://nltk.org/nltk_data/).
14 | Namely, we'll use the following 5 texts:
15 | 
16 | - Moby Dick (can be found inside Project Gutenberg)
17 | - Sense and Sensibility (likewise from Project Gutenberg)
18 | - The Book of Genesis
19 | - Inaugural Address Corpus (this one comes as a collection of separate
20 |   texts that you'll need to cat together into one file)
21 | - NPS Chat Corpus
22 | 
23 | These texts are in the `nltk/data/` directory of `CL-NLP`.
24 | 
25 | The NLTK guys have created a special `Text` class and have defined all the
26 | operations in this chapter as its methods. We'll employ a slightly
27 | simpler approach and implement them as ordinary functions. Yet we'll
28 | also have a special-purpose `text` class to cache reusable results of
29 | long-running operations, like tokenization.
30 | 
31 |     NLTK> (load-nltk-texts "data/")
32 |     #<MOBY ...>
33 |     #<SENSE ...>
34 |     #<GENESIS ...>
35 |     ...
36 | 
37 | As you've already guessed, we've just loaded all the texts.
38 | The number in the last column is each text's character count.
39 | 
40 | Now they are stored in the `*texts*` hash-table.
41 | This is how we can access an individual text and name it for future usage:
42 | 
43 |     (defparameter *sense* (get# :sense *texts*))
44 | 
45 | (`get#` is one of the shorthand functions for operating on hash-tables
46 | defined in [rutils](https://github.com/vseloved/rutils/blob/master/core/packages.lisp))
47 | 
48 | Now we have a variable pointing to "Sense and Sensibility".
49 | If we examine it, this is what we'll see:
50 | 
51 |     NLTK> (describe *sense*)
52 |     #<TEXT ...>
53 |       [standard-object]
54 |     Slots with :INSTANCE allocation:
55 |       NAME = :SENSE
56 |       RAW = "[Sense and Sensibility by Jane Austen 1811]..
57 |       WORDS = #<unbound slot>
58 |       CTXS = #<unbound slot>
59 |       TRANSITIONS = #<unbound slot>
60 |       DISPERSION = #<unbound slot>
61 | 
62 | As you see, there are some unbound slots in this structure: `words`
63 | will hold every word in the text after tokenization, and `ctxs` will be a
64 | table of contexts for each word with their probabilities. By analogy,
65 | `transitions` will be a table of transition probabilities between
66 | words. Finally, `dispersion` will be a table of indices of word
67 | occurrences in the text. We'll use a lazy initialization strategy for them
68 | by defining `slot-unbound` CLOS methods that will be called on first
69 | access to each slot. For example, here's how `words` is initialized:
70 | 
71 |     (defmethod slot-unbound (class (obj text) (slot (eql 'words)))
72 |       (with-slots (raw words) obj
73 |         (format t "~&Tokenizing text...~%")
74 |         (prog1 (setf words (mapcan #`(cons "¶" (tokenize <word-tokenizer> %))
75 |                                    (tokenize <paragraph-splitter> raw)))
76 |           (format t "Number of words: ~A~%" (length words)))))
77 | 
78 | First we split the raw text into paragraphs, because we'd like to
79 | preserve paragraph information. Splitting is slightly involved, as
80 | paragraphs are separated by double newlines, while single newlines end
81 | every line in the text, and we have to distinguish the two. We insert
82 | pilcrow signs at paragraph boundaries. Then we tokenize the paragraphs
83 | into separate words (real words, punctuation marks, symbols, etc).
84 | 
85 | NB. I consider tokenization the crucial function of an NLP toolkit,
86 | and we'll explore it in more detail in one of the future posts.
87 | 
88 | 
89 | ## Implementing the examples
90 | 
91 | OK, now we are ready to start churning out examples from the first
92 | chapter.
93 | 
94 | The first one finds occurrences of certain words in the text. The NLTK guys
95 | perform the search on the tokenized texts. But I think it's quite OK
96 | to do it on raw strings with regexes. This has an added benefit of
97 | preserving the text structure.
98 | 
99 |     NLTK> (concordance *moby* "monstrous")
100 |     Displaying 11 of 11 matches
101 |     former, one was of a most monstrous size. ... This came towards
102 |     "Touching that monstrous bulk of the whale or ork we h
103 |     array of monstrous clubs and spears. Some were
104 |     you gazed, and wondered what monstrous
105 |     has survived the flood; most monstrous
106 |     monstrous fable, or still worse and mor
107 |     Of the Monstrous Pictures of Whales.
108 |     In connexion with the monstrous pictures of whales, I am stro
109 |     o enter upon those still more monstrous stories of them
110 |     ave been rummaged out of this monstrous
111 |     Whale-Bones; for Whales of a monstrous size are
112 | 
113 | With `:pass-newlines` on, we can get output similar to
114 | NLTK's. Let's try one of the homework tasks:
115 | 
116 |     NLTK> (concordance *genesis* "lived" :pass-newlines t)
117 |     Displaying 75 of 75 matches
118 |     t from Yahweh's presence, and lived in the land of Nod, east of
119 |     when they were created. Adam lived one hundred thirty years, and
Adam lived one hundred thirty years, and 120 | ... 121 | 122 | Now let's try similarity. Here we won't do without proper tokenization. 123 | 124 | NLTK> (similar *moby* "monstrous") 125 | Building word contexts... 126 | Tokenizing text... 127 | Number of words: 267803 128 | Number of unique words: 19243 129 | ("mystifying" "subtly" "maddens" "impalpable" "modifies" "vexatious" "candid" 130 | "exasperate" "doleful" "delightfully" "trustworthy" "domineering" "abundant" 131 | "puzzled" "untoward" "contemptible" "gamesome" "reliable" "mouldy" 132 | "determined") 133 | 134 | NLTK> (similar *sense* "monstrous") 135 | Building word contexts... 136 | Tokenizing text... 137 | Number of words: 146926 138 | Number of unique words: 6783 139 | ("amazingly" "vast" "heartily" "extremely" "remarkably" "great" "exceedingly" 140 | "sweet" "very" "so" "good" "a" "as") 141 | 142 | We mostly get the same words as NLTK's result, but with a slightly 143 | different ordering. It turns out, that the reason for this is very 144 | simple. The function `similar` matches words based on the contexts, 145 | where they occur. According to the famous quote by John Rupert Firth: 146 | 147 | > You shall know a word by the company it keeps 148 | 149 | But if we look at context overlap between various words from our list 150 | we'll see that the similarity relation between all these words is 151 | extremely weak: the decision is based on the match of a single context 152 | in which both words appeared in text. In fact, all the listed words 153 | are similar to the same extent. 154 | 155 | NLTK> (common-contexts *moby* "monstrous" "loving") 156 | ("most_and") 157 | NLTK> (common-contexts *moby* "monstrous" "mystifying") 158 | ("most_and") 159 | NLTK> (apply #'common-contexts *moby* (similar *moby* "monstrous")) 160 | ("most_and") 161 | 162 | Actually, the next NLTK example is, probably, the best context overlap 163 | you can get from those texts: 164 | 165 | NLTK> (common-contexts *sense* "monstrous" "very") 166 | ("am_glad" "is_pretty" "a_pretty" "a_lucky") 167 | 168 | Now let's draw a dispersion plot of the words from inaugural corpus. 169 | This task may seem difficult to approach at first, because the authors 170 | use a Python library `matplotlib` for drawing the graph. Fortunately, 171 | there's a language-agnostic tool to achieve similar goals, which is 172 | called `gnuplot`. There is a couple of Lisp wrapper libraries for it, 173 | and the actual code you need to write to drive it amounts to 2 lines 174 | (not counting the code to format the data for consumption). There are, 175 | actually, numerous language-agnostic tools on the Unix platform — 176 | don't forget to look for them when you have such kind of specific need :) 177 | 178 | ![gnuplot dispersion graph with cl-nlp](http://img.photobucket.com/albums/v473/pufpuf/dispersion_zps7447117e.jpg) 179 | 180 | The next problem in this part also seems pretty hard. And, in fact, it 181 | is extremely hard if framed correctly — to generate a meaningful text 182 | based on some other text. But the example solves an easier task to 183 | generate _a somewhat_ meaningful text. And the approach taken to 184 | solve it is a very simple one — it is the baseline method in this area 185 | and is based on Markov chains. There was even a famous mock with 186 | Markov chains in the times of Usenet called [Mark V. Shaney](http://en.wikipedia.org/wiki/Mark_V_Shaney). 187 | Markov models have one principal parameter — _order_. 188 | Mark V. Shaney was an order 2 chain. 
189 | 
190 | Let's try to generate something with it:
191 | 
192 |     NLTK> (princ (generate *genesis* :order 2 :n 93))
193 |     Building prefix transitions index of length 2...
194 |     Number of prefixes: 14326
195 |     In the marsh grass . Behold , the male and female . Of these , and that Jacob obeyed his father . I will return to the earth was dry . God set them in the earth . Noah built an altar there , and put me in the pit ; and that which was our father ' s hand , then the handmaids and their sin is very grievous in your flesh for an everlasting covenant between God and every man to see the nakedness of his flock and keep it .
196 | 
197 | And what if we raise the order?
198 | 
199 |     NLTK> (princ (generate *genesis* :order 3 :n 93))
200 |     Building prefix transitions index of length 3...
201 |     Number of prefixes: 28206
202 |     In the beginning God created the large sea creatures , and every bird , whatever moves on the earth . He stayed yet another seven days , and sent over that which he had gained in Paddan Aram . Esau saw that the interpretation was good , he said to them , they conspired against him to kill him . They took captive all their little ones , and for days and years ; and let it divide the waters from the waters . God said to the younger , and his seed
203 | 
204 | 
205 | The text starts to resemble the original more and more. Also you may
206 | notice that the text will always start with "In". That's because
207 | Genesis isn't split into paragraphs, and our generation starts from
208 | paragraph beginnings, of which there's only one here.
209 | 
210 | OK, this seems to work, but with probabilities you never know for sure... ;)
211 | 
212 | Now, we're left with very simple tasks. Let's just do them:
213 | 
214 |     NLTK> (length (text-words *genesis*))
215 |     44671
216 | 
217 | In the book they had a slightly different number: 44764. This is
218 | because of the different tokenization scheme. The differences can be
219 | seen in the next snippet (we have a cleaner version for this use case :)
220 | 
221 |     NLTK> (take 20 (sort (remove-duplicates (text-words *genesis*) :test 'string=) 'string<))
222 |     ("!" "\"" "'" "(" ")" "," "-" "." ":" ";" "?" "A" "Abel" "Abida" "Abimael"
223 |      "Abimelech" "About" "Abraham" "Abram" "Accad")
224 | 
225 | What about the vocabulary size? Well, once again, it's very similar to the
226 | NLTK number (2789).
227 | 
228 |     NLTK> (length (remove-duplicates (text-words *genesis*) :test 'string=))
229 |     2634
230 | 
231 | Now, let's look for words:
232 | 
233 |     NLTK> (count "smote" (text-words *genesis*) :test 'string=)
234 |     0
235 | 
236 | Hmm... What about some other word?
237 | 
238 |     NLTK> (count "Abraham" (text-words *genesis*) :test 'string=)
239 |     134
240 | 
241 | This seems to work. What's the problem with `"smote"`? It turns out
242 | there's no such word in the Genesis text: at least an examination of
243 | the text doesn't show any traces of it. Looks like we've found a bug
244 | in the book :)
245 | 
246 |     (defun percentage (count total)
247 |       (/ (* 100.0 count) total))
248 |     NLTK> (with-slots (words) *inaugural*
249 |             (percentage (count "a" words :test 'string=) (length words)))
250 |     1.4597242
251 | 
252 |     (defun lexical-diversity (text)
253 |       (with-slots (words) text
254 |         (/ (float (length words))
255 |            (length (remove-duplicates words :test 'string=)))))
256 |     NLTK> (lexical-diversity *genesis*)
257 |     16.959377
258 |     NLTK> (lexical-diversity *chat*)
259 |     6.9837084
260 | 
261 | Interestingly, the results for the `*chat*` corpus differ from the NLTK
262 | ones, although they are calculated based on the tokens provided in the
263 | corpus and not extracted by our tokenization algorithms. This text is
264 | special because it is extracted from an XML-structured document
265 | which also contains the full tokenization.
266 | To use it we swap `words` in the `*chat*` corpus:
267 | 
268 |     NLTK> (setf (text-words *chat*)
269 |                 (mapcar #'token-word
270 |                         (flatten (corpus-text-tokens ncorp:+nps-chat-corpus+))))
271 | 
272 | But first we need to get the corpus and extract the data from it —
273 | see `corpora/nps-chat.lisp` for details.
274 | 
275 | And, finally, we can examine the Brown Corpus.
276 | 
277 |     Genre               |  Tokens  |  Types  |  Lexical Diversity
278 |     --------------------+----------+---------+--------------------
279 |     PRESS-REPORTAGE     |  100554  |  14393  |  7.0
280 |     PRESS-EDITORIAL     |  61604   |  9889   |  6.2
281 |     PRESS-REVIEWS       |  40704   |  8625   |  4.7
282 |     RELIGION            |  39399   |  6372   |  6.2
283 |     SKILL-AND-HOBBIES   |  82345   |  11934  |  6.9
284 |     POPULAR-LORE        |  110299  |  14502  |  7.6
285 |     BELLES-LETTRES      |  173096  |  18420  |  9.4
286 |     MISCELLANEOUS-GOVER |  70117   |  8180   |  8.6
287 |     LEARNED             |  181888  |  16858  |  10.8
288 |     FICTION-GENERAL     |  68488   |  9301   |  7.4
289 |     FICTION-MYSTERY     |  57169   |  6981   |  8.2
290 |     FICTION-SCIENCE     |  14470   |  3232   |  4.5
291 |     FICTION-ADVENTURE   |  69342   |  8873   |  7.8
292 |     FICTION-ROMANCE     |  70022   |  8451   |  8.3
293 |     HUMOR               |  21695   |  5016   |  4.3
294 | 
295 | 
296 | OK, seems like we're done with this chapter. So far there was no
297 | rocket science involved, but it was interesting...
298 | 
299 | ## Implementation details
300 | 
301 | So, what are the interesting bits we haven't discussed?
302 | 
303 | First, let's look at a small optimization trick for calculating
304 | `lexical-diversity`. Our initial variant uses the library function
305 | `remove-duplicates`, which is highly inefficient for this case.
306 | 
307 |     NLTK> (time (lexical-diversity *chat*))
308 |     Evaluation took:
309 |       9.898 seconds of real time
310 |       9.888618 seconds of total run time (9.888618 user, 0.000000 system)
311 |       99.91% CPU
312 |       23,687,560,947 processor cycles
313 |       229,392 bytes consed
314 | 
315 | What we'd like to do is something similar to the Python version,
316 | which puts everything in a set and calculates its size.
317 | A set is easily represented with a hash-table:
318 | 
319 |     (defun uniq (list &key raw case-insensitive)
320 |       "Return only unique elements from LIST either as a new list
321 |        or as a hash-table if RAW is set. Can be CASE-INSENSITIVE."
322 |       (let ((uniqs (make-hash-table :test (if case-insensitive 'equalp 'equal))))
323 |         (dolist (elt list)
324 |           (set# elt uniqs t))
325 |         (if raw uniqs (ht-keys uniqs))))
326 | 
327 | Here's the time of the same calculation using `uniq`:
328 | 
329 |     NLTK> (time (lexical-diversity *chat*))
330 |     Evaluation took:
331 |       0.014 seconds of real time
332 |       0.012001 seconds of total run time (0.012001 user, 0.000000 system)
333 |       85.71% CPU
334 |       33,396,336 processor cycles
335 |       613,568 bytes consed
336 | 
337 | A roughly 700x speed increase!
338 | 
339 | Now, let's return to text generation. It is accomplished with the
340 | following loop (a simplified version):
341 | 
342 |     (loop :for i :from 1 :to length :do
343 |        (let ((r (random 1.0))
344 |              (total 0))
345 |          (dotable (word prob
346 |                    (or (get# prefix transitions)
347 |                        ;; no continuation - start anew
348 |                        (prog1 (get# (setf prefix initial-prefix) transitions)
349 |                          ;; add period unless one is already there
350 |                          (unless (every #'period-char-p (car rez))
351 |                            (push "." rez)
352 |                            (incf i)))))
353 |            (when (> (incf total prob) r)
354 |              (push word rez)
355 |              (setf prefix (cons word (butlast prefix)))
356 |              (return)))))
357 | 
358 | On each iteration it places all possible continuations of the current
359 | prefix on a segment from 0 to 1 and generates a random number that
360 | points to one of the variants. If there's no continuation, it starts
361 | anew.
362 | 
363 | The NLTK book, actually, uses a slightly more complicated model: first it
364 | builds a probability distribution on top of the transition frequencies
365 | and then generates the text from the probabilities. As of now, I don't
366 | see why this is needed or whether it makes any difference in the results.
367 | 
368 | And, finally, here's how we draw the dispersion plot:
369 | 
370 |     (defun plot-dispersion (words file)
371 |       "Plot WORDS dispersion data from FILE."
372 |       (cgn:start-gnuplot)
373 |       (cgn:format-gnuplot "set title \"Lexical Dispersion Plot\"")
374 |       (cgn:format-gnuplot "plot \"~A\" using 1:2:yticlabels(3) title \"\"" file))
375 | 
376 | It's just 1 line of `gnuplot` code, actually, but we also need to
377 | prepare the data in a tab-separated text file:
378 | 
379 |     (defun dump-data (words dispersion-table)
380 |       "Dump data from DISPERSION-TABLE for WORDS into a temporary file
381 |        and return its name."
382 |       (let ((filename (fmt "/tmp/~A" (gensym))))
383 |         (with-out-file (out filename)
384 |           (format out "0~t0~t~%")
385 |           (doindex (i word words)
386 |             (dolist (idx (get# word dispersion-table))
387 |               (format out "~A~t~A~t~A~%" idx (1+ i) word)))
388 |           (format out "0~t~A~t~%" (1+ (length words))))
389 |         filename))
390 | 
391 | To wrap up, we've seen a demonstration of a lot of useful tools for
392 | text processing, and also discussed how they can be built. Among all
393 | of them I want to highlight the utility of the seemingly simplistic
394 | `concordance`, which is actually a kind of `grep` tool that is
395 | indispensable for any text exploration. I even used it a couple of
396 | times to debug issues in more complex functions from this pack.
397 | 
--------------------------------------------------------------------------------