├── LICENSE ├── Makefile ├── README.md ├── base-forms.lisp ├── chunker.lisp ├── cluster.lisp ├── compile.lisp ├── corpora.lisp ├── cross-validation.lisp ├── data ├── contractions.txt └── user-lexicon.txt ├── db-methods.lisp ├── db.lisp ├── edit-distance.lisp ├── english-lexicon.lisp ├── english.lisp ├── french-lexicon.lisp ├── french.lisp ├── fuzzy.lisp ├── german.lisp ├── globals.lisp ├── grammar.lisp ├── initialize.lisp ├── italian.lisp ├── kyoto-lexicon.lisp ├── lexicon.lisp ├── lgrammar.lisp ├── lib ├── cffi-wordnet │ ├── LICENSE │ ├── README │ ├── cffi-wordnet.asd │ ├── cffi.lisp │ ├── constants.lisp │ ├── contrib │ │ └── macos │ │ │ └── Makefile │ ├── wordnet-package.lisp │ ├── wordnet.lisp │ └── wordnet.swig ├── graph-utils │ ├── LICENSE │ ├── README │ ├── TODO │ ├── bipartite.lisp │ ├── cut.lisp │ ├── data │ │ ├── Dining-cut.dot │ │ ├── Dining-cut.png │ │ ├── Dining.dot │ │ ├── Dining.log │ │ ├── Dining.net │ │ ├── Dining.png │ │ ├── adjnoun-cut.dot │ │ ├── adjnoun-cut.png │ │ ├── adjnoun.dot │ │ ├── adjnoun.gml │ │ ├── adjnoun.log │ │ ├── adjnoun.png │ │ ├── adjnoun.txt │ │ ├── bipartite.dot │ │ ├── bipartite.net │ │ ├── bipartite1.dot │ │ ├── bipartite1.net │ │ ├── dolphins-cut.dot │ │ ├── dolphins-cut.png │ │ ├── dolphins.dot │ │ ├── dolphins.gml │ │ ├── dolphins.log │ │ ├── dolphins.png │ │ ├── dolphins.txt │ │ ├── flow-big.net │ │ ├── flow-simple.dot │ │ ├── flow-simple.net │ │ ├── flow.dot │ │ ├── flow.net │ │ ├── flow1.dot │ │ ├── flow1.net │ │ ├── football-cut.dot │ │ ├── football-cut.png │ │ ├── football.dot │ │ ├── football.gml │ │ ├── football.log │ │ ├── football.png │ │ ├── football.txt │ │ ├── karate-cut.dot │ │ ├── karate-cut.png │ │ ├── karate.dot │ │ ├── karate.gml │ │ ├── karate.log │ │ ├── karate.png │ │ ├── karate.txt │ │ ├── matching.dot │ │ ├── netscience.gml │ │ ├── netscience.txt │ │ ├── netscience.zip │ │ └── test.net │ ├── edge.lisp │ ├── edit-distance.lisp │ ├── fib-heap.lisp │ ├── functor.lisp │ ├── graph-class.lisp │ ├── graph-generation.lisp │ ├── graph-methods.lisp │ ├── graph-package.lisp │ ├── graph-test.lisp │ ├── graph-utils.asd │ ├── graph-visualization.lisp │ ├── index.lisp │ ├── maximum-flow.lisp │ ├── node.lisp │ ├── parsers.lisp │ ├── prolog-functors.lisp │ ├── prologc.lisp │ ├── queue.lisp │ ├── r-test.lisp │ ├── sparse-arrays.lisp │ ├── tests.lisp │ ├── triples.lisp │ ├── typed-edge-graph-class.lisp │ ├── typed-edge-graph-methods.lisp │ └── utilities.lisp └── porter-stemmer │ ├── package.lisp │ ├── porter-stemmer.asd │ └── stemmer.lisp ├── libstemmer.lisp ├── ngrams.lisp ├── nlp-package.lisp ├── nlp.asd ├── parser.lisp ├── parsing.lisp ├── pcfg.lisp ├── pcp.lisp ├── portuguese.lisp ├── pos-map.lisp ├── pos-symbols.lisp ├── pos-tag.lisp ├── prob-parser.lisp ├── prolog ├── prologdb.5 ├── prologdb.5.pdf ├── prologdb.5.ps ├── prologdb.5WN.html ├── senseidx.5 ├── senseidx.5.pdf ├── senseidx.5.ps ├── senseidx.5WN.html ├── wn_ant.pl ├── wn_at.pl ├── wn_cls.pl ├── wn_cs.pl ├── wn_der.pl ├── wn_ent.pl ├── wn_fr.pl ├── wn_g.pl ├── wn_hyp.pl ├── wn_ins.pl ├── wn_mm.pl ├── wn_mp.pl ├── wn_ms.pl ├── wn_per.pl ├── wn_ppl.pl ├── wn_s.pl ├── wn_sa.pl ├── wn_sim.pl ├── wn_sk.pl ├── wn_syntax.pl ├── wn_vgp.pl ├── wndb.5 ├── wndb.5.pdf ├── wndb.5.ps ├── wndb.5WN.html ├── wngroups.7 ├── wngroups.7.pdf ├── wngroups.7.ps ├── wngroups.7WN.html ├── wninput.5 ├── wninput.5.pdf ├── wninput.5.ps ├── wninput.5WN.html ├── wnpkgs.7 ├── wnpkgs.7.pdf ├── wnpkgs.7.ps └── wnpkgs.7WN.html ├── read-ancora-es.lisp ├── read-cess-esp.lisp ├── read-floresta.lisp ├── read-paisa.lisp ├── read-tiger.lisp ├── read-tut.lisp ├── read-wikicorpus-es.lisp ├── semantic-similarity-li.lisp ├── semantic-similarity-raison.lisp ├── semantic-similarity.lisp ├── sentence-splitter.lisp ├── spanish-lexicon.lisp ├── spanish-stemmer.lisp ├── spanish.lisp ├── stemmer.lisp ├── stop-words.lisp ├── train-english.lisp ├── train-french.lisp ├── train-german.lisp ├── train-italian.lisp ├── train-portuguese.lisp ├── train-spanish.lisp ├── utilities.lisp ├── wiki-es.lisp └── wordnet.lisp /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Kevin Thomas Raison 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | 21 | Except as contained in this notice, the name(s) of the above copyright holders 22 | shall not be used in advertising or otherwise to promote the sale, use or other 23 | dealings in this Software without prior written authorization. 24 | 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # NLP Service Makefile 2 | 3 | CC=gcc 4 | LISP=sbcl 5 | DEPS_STATUS=deps.status 6 | 7 | all: deps train-all 8 | 9 | data: 10 | aws s3 cp s3://answerdash.resources/cl-nlp/data/ ./data/ --recursive 11 | 12 | deps: 13 | sudo apt-get install -y -f --force-yes wordnet graphviz ed libstemmer-dev libstemmer-tools libstemmer0d libkyotocabinet-dev libkyotocabinet16 14 | sudo rm -f /usr/lib/libwordnet.so 15 | sudo ln -s /usr/lib/libwordnet-3.0.so /usr/lib/libwordnet.so 16 | echo "ok" > $(DEPS_STATUS) 17 | 18 | train-all: clean deps train-english train-german train-spanish train-italian train-portuguese train-french 19 | tar czf languages.tar.gz \ 20 | english.dat english-*.kch \ 21 | german.dat german-*.kch \ 22 | spanish.dat spanish-*.kch \ 23 | french.dat french-*.kch \ 24 | portuguese.dat portuguese-*.kch \ 25 | italian.dat italian-*.kch 26 | 27 | train-english: deps 28 | $(LISP) --dynamic-space-size 16384 --non-interactive --load train-english.lisp 29 | 30 | train-german: deps 31 | $(LISP) --dynamic-space-size 16384 --non-interactive --load train-german.lisp 32 | 33 | train-spanish: deps 34 | $(LISP) --dynamic-space-size 16384 --non-interactive --load train-spanish.lisp 35 | 36 | train-italian: deps 37 | $(LISP) --dynamic-space-size 16384 --non-interactive --load train-italian.lisp 38 | 39 | train-portuguese: deps 40 | $(LISP) --dynamic-space-size 16384 --non-interactive --load train-portuguese.lisp 41 | 42 | train-french: deps 43 | $(LISP) --dynamic-space-size 16384 --non-interactive --load train-french.lisp 44 | 45 | test: 46 | $(LISP) --dynamic-space-size 16384 --non-interactive --load test.lisp 47 | 48 | push: train-all 49 | aws s3 cp languages.tar.gz s3://answerdash.resources/cl-nlp/languages.tar.gz 50 | 51 | clean: 52 | $(RM) $(DEPS_STATUS) 53 | $(RM) english.dat english*.kch 54 | $(RM) german.dat german*.kch 55 | $(RM) portuguese.dat portuguese*.kch 56 | $(RM) spanish.dat spanish*.kch 57 | $(RM) italian.dat italian*.kch 58 | $(RM) french.dat french*.kch 59 | $(RM) languages.tar.gz 60 | 61 | # DO NOT DELETE THIS LINE -- make depend needs it 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## NLP for common lisp 2 | 3 | The following system libraries are required to use this package: 4 | 5 | * kyotocabinet 6 | * wordnet 3.0 7 | * r-base 8 | * ed 9 | * graphviz 10 | * libstemmer-dev 11 | * libstemmer-tools 12 | * libstemmer0d 13 | 14 | Lisp library dependencies that are not available via quicklisp are in the 15 | lib/ directory 16 | 17 | This version supports English, French, German, Portuguese, Spanish and Italian. 18 | 19 | Pre-trained data sets are available at http://chatsubo.net/~raison/languages.tar.gz 20 | 21 | I can make the various corpora available for retraining on an as-needed basis. 22 | -------------------------------------------------------------------------------- /base-forms.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defun singularize (word) 4 | "Singularize a word using Wordnet. Assumes you have already called 5 | (cffi-wordnet:wordnet-init)" 6 | (let ((singular (cffi-wordnet:morph-word word cffi-wordnet:+noun+))) 7 | (or singular word))) 8 | 9 | (defun verb-base-form (verb) 10 | "Find base form of the verb. Assumes you have already called 11 | (cffi-wordnet:wordnet-init)" 12 | (let ((base (cffi-wordnet:morph-word verb cffi-wordnet:+verb+))) 13 | (or base verb))) 14 | -------------------------------------------------------------------------------- /compile.lisp: -------------------------------------------------------------------------------- 1 | (push #P"./lib/cffi-wordnet/" asdf:*central-registry*) 2 | (push #P"./lib/cl-daemonize/" asdf:*central-registry*) 3 | (push #P"./lib/graph-utils/" asdf:*central-registry*) 4 | (push #P"./lib/porter-stemmer/" asdf:*central-registry*) 5 | (push #P"./" asdf:*central-registry*) 6 | 7 | (ql:quickload :nlp) 8 | (quit 0) 9 | -------------------------------------------------------------------------------- /data/contractions.txt: -------------------------------------------------------------------------------- 1 | ain't am not 2 | aint am not 3 | aren't are not 4 | arent are not 5 | can't can not 6 | cant can not 7 | could've could have 8 | couldve could have 9 | couldn't could not 10 | couldnt could not 11 | couldn't've could not have 12 | didn't did not 13 | didnt did not 14 | doesn't does not 15 | doesnt does not 16 | don't do not 17 | dont do not 18 | hadn't had not 19 | hadnt had not 20 | hadn't've had not have 21 | hasn't has not 22 | hasnt has not 23 | haven't have not 24 | havent have not 25 | he'd've he would have 26 | he'll he will 27 | how'll how will 28 | howll how will 29 | I'd've I would have 30 | I'll I will 31 | I'm I am 32 | I've I have 33 | isn't is not 34 | isnt is not 35 | it'd've it would have 36 | it'll it will 37 | let's let us 38 | lets let us 39 | ma'am madam 40 | mightn't might not 41 | mightn't've might not have 42 | might've might have 43 | mustn't must not 44 | must've must have 45 | needn't need not 46 | not've not have 47 | oughtn't ought not 48 | 'ow's'at how is that 49 | shan't shall not 50 | she'd've she would have 51 | she'll she will 52 | should've should have 53 | shouldve should have 54 | shouldn't should not 55 | shouldnt should not 56 | shouldn't've should not have 57 | somebody'd've somebody would have 58 | somebody'll somebody will 59 | someone'd've someone would have 60 | someone'll someone will 61 | something'd've something would have 62 | that'll that will 63 | there'd've there would have 64 | there're there are 65 | they'd've they would have 66 | they'll they will 67 | they're they are 68 | they've they have 69 | 'twas it was 70 | wasn't was not 71 | we'd've we would have 72 | we'll we will 73 | we're we are 74 | we've we have 75 | weren't were not 76 | what'll what will 77 | what're what are 78 | what've what have 79 | where'd where did 80 | where've where have 81 | who'd've who would have 82 | who'll who will 83 | who're who are 84 | who've who have 85 | why'll why will 86 | why're why are 87 | won't will not 88 | would've would have 89 | wouldve would have 90 | wouldn't would not 91 | wouldnt would not 92 | wouldn't've would not have 93 | y'all you all 94 | you'd've you would have 95 | you'll you will 96 | you're you are 97 | you've you have 98 | -------------------------------------------------------------------------------- /db-methods.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defmethod print-object ((language language) stream) 4 | (print-unreadable-object (language stream) 5 | (format stream 6 | "~A (~A): ~A bigrams, ~A trigrams ~A observation likelihoods" 7 | (type-of language) 8 | (name language) 9 | (hash-table-count (bigrams language)) 10 | (hash-table-count (trigrams language)) 11 | (hash-table-count (observations language))))) 12 | 13 | (defgeneric language-p (language) 14 | (:method ((language language)) t) 15 | (:method (thing) nil)) 16 | 17 | (defmethod part-of-speech-tags ((language language)) 18 | (alexandria:hash-table-keys (unigrams language))) 19 | 20 | (defmethod add-language ((language language)) 21 | (setf (gethash (name language) *languages*) language) 22 | (dolist (alias (aliases language)) 23 | (setf (gethash alias *languages*) language)) 24 | language) 25 | 26 | (defun lookup-language (name) 27 | (gethash name *languages*)) 28 | 29 | (defmethod set-language-finalizers ((language language)) 30 | "Make sure the Kyoto Cabinet references are properly closed if this instance 31 | is garbage collected." 32 | (let* ((lexicon-dbm (lexicon-dbm language)) 33 | (plexicon-dbm (plexicon-dbm language)) 34 | (observations-dbm (observations-dbm language)) 35 | (word-occurrence-dbm (word-occurrence-dbm language))) 36 | #+sbcl 37 | (sb-ext:finalize language 38 | (lambda () 39 | (when (typep word-occurrence-dbm 'kc-dbm) 40 | (dbm-close word-occurrence-dbm)) 41 | (when (typep observations-dbm 'kc-dbm) 42 | (dbm-close observations-dbm)) 43 | (when (typep lexicon-dbm 'kc-dbm) 44 | (dbm-close lexicon-dbm)) 45 | (when (typep plexicon-dbm 'kc-dbm) 46 | (dbm-close plexicon-dbm)))) 47 | language)) 48 | 49 | (defmethod make-new-language ((class symbol)) 50 | "Constructor that takes care of creating language objects and lexicon dbms" 51 | (let ((language (make-instance class))) 52 | (make-word-occurrence-dbm language) 53 | (make-observations-dbm language) 54 | (make-lexicon-dbm language) 55 | (make-plexicon-dbm language) 56 | (set-language-finalizers language) 57 | language)) 58 | 59 | (defmethod close-language ((language language)) 60 | (close-word-occurrence-dbm language) 61 | (close-observations-dbm language) 62 | (close-lexicon-dbm language) 63 | (close-plexicon-dbm language) 64 | (remhash (name language) *languages*) 65 | (dolist (alias (aliases language)) 66 | (remhash alias *languages*))) 67 | 68 | (defmethod noun-p ((language language) (word string)) 69 | (some (lambda (pos-tag) 70 | (noun-p language pos-tag)) 71 | (nlp:lookup-pos language word))) 72 | 73 | (defmethod adjective-p ((language language) (word string)) 74 | (some (lambda (pos-tag) 75 | (adjective-p language pos-tag)) 76 | (nlp:lookup-pos language word))) 77 | 78 | (defmethod verb-p ((language language) (word string)) 79 | (some (lambda (pos-tag) 80 | (verb-p language pos-tag)) 81 | (nlp:lookup-pos language word))) 82 | 83 | (defmethod adverb-p ((language language) (word string)) 84 | (some (lambda (pos-tag) 85 | (adverb-p language pos-tag)) 86 | (nlp:lookup-pos language word))) 87 | 88 | (defmethod determiner-p ((language language) (word string)) 89 | (some (lambda (pos-tag) 90 | (determiner-p language pos-tag)) 91 | (nlp:lookup-pos language word))) 92 | 93 | (defmethod pronoun-p ((language language) (word string)) 94 | (some (lambda (pos-tag) 95 | (pronoun-p language pos-tag)) 96 | (nlp:lookup-pos language word))) 97 | -------------------------------------------------------------------------------- /db.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defvar *languages* (make-hash-table :test 'equalp)) 4 | 5 | (defclass language () 6 | ((name :accessor name :initform "" :initarg :name) 7 | (aliases :accessor aliases :initform nil :initarg :aliases) 8 | (default-encoding :accessor default-encoding :initform nil 9 | :initarg :default-encoding) 10 | (alphabet :accessor alphabet :initform nil :initarg :alphabet) 11 | 12 | ;; POS tagging 13 | (total-count :accessor total-count :initform 0) 14 | (gammas :accessor gammas :initform (make-hash-table)) 15 | (probabilities :accessor probabilities :initform (make-hash-table :test 'equal)) 16 | (unigrams :accessor unigrams :initform (make-hash-table :test 'equal)) 17 | (bigrams :accessor bigrams :initform (make-hash-table :test 'equal)) 18 | (trigrams :accessor trigrams :initform (make-hash-table :test 'equal)) 19 | (tag-occurrences :accessor tag-occurrences 20 | :initform (make-hash-table :test 'equal)) 21 | (word-occurrences :accessor word-occurrences 22 | :initform (make-hash-table :test 'equalp 23 | :weakness :key-or-value)) 24 | (word-occurrence-file :accessor word-occurrence-file :initform "" 25 | :initarg :word-occurrence-file) 26 | (word-occurrence-dbm :accessor word-occurrence-dbm :initform nil 27 | :initarg :word-occurrence-dbm) 28 | 29 | (observations :accessor observations 30 | :initform (make-hash-table :test 'equal 31 | :weakness :key-or-value)) 32 | (observations-file :accessor observations-file :initform "" 33 | :initarg :observations-file) 34 | (observations-dbm :accessor observations-dbm :initform nil 35 | :initarg :observations-dbm) 36 | 37 | (unknown-probability :accessor unknown-probability :initform 0) 38 | 39 | ;; HMM Chunking 40 | (np-total-count :accessor np-total-count :initform 0) 41 | (np-gammas :accessor np-gammas :initform (make-hash-table)) 42 | (np-probabilities :accessor np-probabilities 43 | :initform (make-hash-table :test 'equal)) 44 | (np-unigrams :accessor np-unigrams :initform (make-hash-table :test 'equal)) 45 | (np-bigrams :accessor np-bigrams :initform (make-hash-table :test 'equal)) 46 | (np-trigrams :accessor np-trigrams :initform (make-hash-table :test 'equal)) 47 | (np-tag-occurrences :accessor np-tag-occurrences 48 | :initform (make-hash-table :test 'equal)) 49 | (np-pos-occurrences :accessor np-pos-occurrences 50 | :initform (make-hash-table :test 'equal)) 51 | (np-observations :accessor np-observations 52 | :initform (make-hash-table :test 'equal)) 53 | (np-unknown-probability :accessor np-unknown-probability :initform 0) 54 | 55 | ;; Lexicon 56 | (lexicon-file :accessor lexicon-file :initform "" :initarg :lexicon-file) 57 | (lexicon-dbm :accessor lexicon-dbm :initform nil :initarg :lexicon-dbm) 58 | (plexicon-file :accessor plexicon-file :initform "" :initarg :plexicon-file) 59 | (plexicon-dbm :accessor plexicon-dbm :initform nil :initarg :plexicon-dbm) 60 | 61 | (vowels :accessor vowels :initform nil :initarg :vowels) 62 | 63 | (lexicon :accessor lexicon :initform (make-hash-table :test 'equal)) 64 | (plexicon :accessor plexicon :initform (make-hash-table :test 'equal)) 65 | (word-freq :accessor word-freq :initform (make-hash-table :test 'equal)) 66 | 67 | (user-pos-regex :accessor user-pos-regex :initform nil) 68 | (contraction-table :accessor contraction-table 69 | :initform (make-hash-table :test 'equalp)) 70 | (stop-words :accessor stop-words :initform (make-hash-table :test 'equalp)) 71 | (ngrams :accessor ngrams :initform (make-hash-table :test 'equal)) 72 | 73 | ;; Wordnet 74 | (synset-table :accessor synset-table :initform (make-hash-table)) 75 | (word-to-synset-table :accessor word-to-synset-table 76 | :initform (make-hash-table :test 'equalp)) 77 | (word-sense-to-synset-table :accessor word-sense-to-synset-table 78 | :initform (make-hash-table :test 'equalp)) 79 | (word-pos-to-synset-table :accessor word-pos-to-synset-table 80 | :initform (make-hash-table :test 'equalp)) 81 | 82 | ;; Parsing 83 | (np-regexes :accessor np-regexes :initform nil) 84 | (cfg :accessor cfg :initform (make-hash-table)) 85 | (cfg-idx :accessor cfg-idx :initform (make-hash-table)) 86 | (pcfg :accessor pcfg :initform (make-hash-table :test 'equalp)) 87 | (lcfg :accessor lcfg :initform nil) ;;(make-lcfg-table)) 88 | (cnf-grammar :accessor cnf-grammar :initform (make-hash-table)) 89 | (cnf-index :accessor cnf-index :initform (make-hash-table :test 'equalp)) 90 | (cnf-subs-map :accessor cnf-subs-map 91 | :initform (make-hash-table :test 'equalp)) 92 | (cnf-subs-rev :accessor cnf-subs-rev 93 | :initform (make-hash-table :test 'equalp)))) 94 | -------------------------------------------------------------------------------- /edit-distance.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:nlp) 2 | 3 | (defun edit-distance (s1 s2 &key (insert-cost 1) (delete-cost 1) (sub-cost 2)) 4 | "Basic edit distance algorithm with static costs" 5 | (let* ((s1-length (length s1)) 6 | (s2-length (length s2)) 7 | (distance (make-array `(,(1+ s1-length) ,(1+ s2-length)) 8 | :initial-element 0))) 9 | (loop for i from 1 to s1-length do 10 | (setf (aref distance i 0) (+ (aref distance (1- i) 0) insert-cost))) 11 | (loop for j from 1 to s2-length do 12 | (setf (aref distance 0 j) (+ (aref distance 0 (1- j)) delete-cost))) 13 | (loop for i from 1 to s1-length do 14 | (loop for j from 1 to s2-length do 15 | (setf (aref distance i j) 16 | (min (+ (aref distance (1- i) j) insert-cost) 17 | (+ (aref distance i (1- j)) delete-cost) 18 | (+ (aref distance (1- i) (1- j)) 19 | (if (eql (elt s1 (1- i)) (elt s2 (1- j))) 20 | 0 21 | sub-cost)))))) 22 | ;;(values (aref distance s1-length s2-length) distance))) 23 | (aref distance s1-length s2-length))) 24 | 25 | (defun lcs-backtrack (matrix s1 s2 i j &key (equality-fn 'char-equal)) 26 | (cond ((or (= -1 i) (= -1 j)) 27 | nil) 28 | ((funcall equality-fn (elt s1 i) (elt s2 j)) 29 | (nconc (lcs-backtrack matrix s1 s2 (1- i) (1- j)) 30 | (list (elt s1 i)))) 31 | ((> (aref matrix i (1- j)) 32 | (aref matrix (1- i) j)) 33 | (lcs-backtrack matrix s1 s2 i (1- j))) 34 | (t 35 | (lcs-backtrack matrix s1 s2 (1- i) j)))) 36 | 37 | (defun longest-common-subseq (s1 s2 &key (equality-fn 'char-equal)) 38 | (let* ((s1-length (length s1)) 39 | (s2-length (length s2)) 40 | (matrix (make-array (list s1-length s2-length) 41 | :element-type 'integer 42 | :initial-element 0))) 43 | (loop for i from 0 below s1-length do 44 | (loop for j from 0 below s2-length do 45 | (if (funcall equality-fn (elt s1 i) (elt s2 j)) 46 | (setf (aref matrix i j) 47 | (1+ (if (or (zerop i) (zerop j)) 48 | 0 49 | (aref matrix (1- i) (1- j))))) 50 | (setf (aref matrix i j) 51 | (max (if (zerop j) 0 (aref matrix i (1- j))) 52 | (if (zerop i) 0 (aref matrix (1- i) j))))))) 53 | (values (lcs-backtrack matrix 54 | s1 s2 55 | (1- s1-length) (1- s2-length) 56 | :equality-fn equality-fn) 57 | (aref matrix (1- s1-length) (1- s2-length))))) 58 | -------------------------------------------------------------------------------- /english-lexicon.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (let ((verb-regex (create-scanner 4 | "\\w(ing|ate|ify|ize|ise|ed)$" 5 | :single-line-mode t :case-insensitive-mode t)) 6 | (adj-regex (create-scanner 7 | "\\w(able|ible|al|ial|ic|y|ing|ed|ful|ish|ive|ous|ious)$" 8 | :single-line-mode t :case-insensitive-mode t)) 9 | (adv-regex (create-scanner 10 | "\\w(ly|ally|ily)$" 11 | :single-line-mode t :case-insensitive-mode t)) 12 | (noun-regex 13 | (create-scanner 14 | "\\w(er|or|ance|ence|ant|ent|ee|ess|ian|ism|ics|ist|ity|ment|ness|ship|tion|ation|ure|man|woman|eur|ing|hood)$" 15 | :single-line-mode t :case-insensitive-mode t)) 16 | (pnoun-regex (create-scanner 17 | "^[A-Z]\\w" :single-line-mode t :case-insensitive-mode nil))) 18 | 19 | (defmethod in-lexicon-p ((language english) word pos) 20 | "Is word as pos in the lexicon?" 21 | (when (symbolp word) (setq word (symbol-name word))) 22 | (or (member pos (lookup-pos-dbm language word)) 23 | (member pos (lookup-pos-dbm language (string-downcase word))) 24 | (and (eq pos :CD) (scan *number-regex* word)))) 25 | 26 | (defmethod lookup-pos ((language english) word) 27 | "Return all possible parts of speech for word" 28 | (let ((pos-list (or (lookup-pos-dbm language word) 29 | (lookup-pos-dbm language (string-downcase word))))) 30 | (when (or (scan *number-regex* word) 31 | (hex-number-p word)) 32 | (pushnew :CD pos-list)) 33 | (remove-duplicates 34 | (remove-if 35 | 'null 36 | (append 37 | (or pos-list 38 | (append 39 | (and (scan noun-regex word) (list :NN)) 40 | (and (scan *host-regex* word) (list :NNP)) 41 | (and (scan *ip-regex* word) (list :NN)) 42 | (and (scan pnoun-regex word) (list :NNP)) 43 | (and (scan verb-regex word) (list :VB)) 44 | (and (scan adv-regex word) (list :RB)) 45 | (and (scan adj-regex word) (list :JJ)))) 46 | (mapcan (lambda (pair) 47 | (and (scan (car pair) word) (list (cdr pair)))) 48 | (user-pos-regex language)))))))) 49 | -------------------------------------------------------------------------------- /french-lexicon.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (let ((number-regex 4 | (create-scanner 5 | "^[£\\%\\#±¥\\$\\+]?[½¾0123456789\\-./,:°º\\+]+_?[\\%°£$¥]?$" 6 | :single-line-mode t)) 7 | (infinitive-regex (create-scanner 8 | "\\w(er|ir|re)$" 9 | :single-line-mode t :case-insensitive-mode t)) 10 | (verb-regex (create-scanner 11 | "\\w(e|es|ons|ez|ent|is|it|issons|issez|issent)$" 12 | :single-line-mode t :case-insensitive-mode t)) 13 | (adv-regex (create-scanner "\\w(ment)$" 14 | :single-line-mode t :case-insensitive-mode t)) 15 | (noun-regex 16 | (create-scanner 17 | "\\w()$" 18 | :single-line-mode t :case-insensitive-mode t)) 19 | (pnoun-regex (create-scanner "^[A-Z]\\w" 20 | :single-line-mode t 21 | :case-insensitive-mode nil))) 22 | 23 | (defun french-number-p (text) 24 | (scan number-regex text)) 25 | 26 | (defmethod in-lexicon-p ((language french) word pos) 27 | "Is word / part-of-speech pair in the lexicon?" 28 | (when (symbolp word) (setq word (symbol-name word))) 29 | (or (member pos (lookup-pos-dbm language word)) 30 | (member pos (lookup-pos-dbm language (string-downcase word))) 31 | (and (or (eq pos :NC) 32 | (scan number-regex word))))) 33 | 34 | (defmethod lookup-pos ((language french) word) 35 | "Return all possible parts of speech for word" 36 | (let ((pos-list (or (lookup-pos-dbm language word) 37 | (lookup-pos-dbm language (string-downcase word))))) 38 | (when (scan number-regex word) 39 | (pushnew :NC pos-list)) 40 | (remove-duplicates 41 | (remove-if 42 | 'null 43 | (append 44 | (or pos-list 45 | (append 46 | (and (scan noun-regex word) (list :NC :NPP)) 47 | (and (scan *host-regex* word) (list :NPP :NC)) 48 | (and (scan *ip-regex* word) (list :NC :NPP)) 49 | (and (scan pnoun-regex word) (list :NPP :NC)) 50 | (and (scan verb-regex word) (list :V)) 51 | (and (scan infinitive-regex word) (list :VINF)) 52 | (and (scan adv-regex word) (list :ADV)))) 53 | (mapcan (lambda (pair) 54 | (and (scan (car pair) word) (list (cdr pair)))) 55 | (user-pos-regex language)))))))) 56 | -------------------------------------------------------------------------------- /french.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defclass french (language) 4 | ((name :accessor name :initform "french" :initarg :name) 5 | (aliases :accessor aliases 6 | :initform '("fr" "fr-be" "fr-ca" "fr-fr" "fr-lu" "fr-ch") 7 | :initarg :aliases) 8 | (vowels :accessor vowels :initform 9 | '(#\a #\e #\i #\o #\u #\à #\è #\é #\ì #\í #\î #\ò #\ó #\ù #\ú) 10 | :initarg :vowels) 11 | (default-encoding :accessor default-encoding :initform :utf-8 12 | :initarg :default-encoding))) 13 | 14 | (defmethod noun-p ((language french) symbol) 15 | (member symbol '(:NC :NPP))) 16 | 17 | (defmethod adjective-p ((language french) symbol) 18 | (member symbol '(:ADJWH :ADJ))) 19 | 20 | (defmethod verb-p ((language french) symbol) 21 | (member symbol '(:V :VIMP :VINF :VPP :VPR :VS))) 22 | 23 | (defmethod adverb-p ((language french) symbol) 24 | (member symbol '(:ADV :ADVWH))) 25 | 26 | (defmethod determiner-p ((language french) symbol) 27 | (member symbol '(:DET :DETWH))) 28 | 29 | (defmethod pronoun-p ((language french) symbol) 30 | (member symbol '(:PRO :PROREL :PROWH))) 31 | 32 | (defun convert-sequoia-conll-file (in-file out-file) 33 | "Convert a Sequoia corpus file into standard word/pos format" 34 | (with-open-file (stream out-file 35 | :direction :output 36 | :if-exists :supersede 37 | :if-does-not-exist :create) 38 | (map-conll-corpus 39 | (lambda (sentence pos-seq) 40 | (dotimes (i (length sentence)) 41 | (format stream "~A/~A" (elt sentence i) (elt pos-seq i)) 42 | (if (= i (- (length sentence) 1)) 43 | (terpri stream) 44 | (format stream " ")))) 45 | in-file))) 46 | 47 | (defun convert-frwikinews-file (in-file out-file) 48 | "Convert an frwikinews corpus file into standard word/pos format" 49 | (with-open-file (out-stream out-file 50 | :direction :output 51 | :element-type 'character 52 | :external-format :utf-8 53 | :if-exists :supersede 54 | :if-does-not-exist :create) 55 | (with-open-file (in-stream in-file 56 | :direction :input 57 | :element-type 'character 58 | :external-format :utf-8) 59 | (do ((line (read-line in-stream nil :eof) (read-line in-stream nil :eof))) 60 | ((eql line :eof)) 61 | (let ((tokens (split "\\s+" line))) 62 | (dotimes (i (length tokens)) 63 | (let ((token (elt tokens i))) 64 | (let ((marker-position (position #\_ token :from-end t))) 65 | (let ((word (subseq token 0 marker-position)) 66 | (pos (subseq token (1+ marker-position)))) 67 | (format out-stream "~A/~A" word pos) 68 | (if (= i (- (length tokens) 1)) 69 | (format out-stream "~%") 70 | (format out-stream " "))))))))))) 71 | 72 | (defun flatten-french-phrase-tree (tree) 73 | (let ((leaf-phrases nil)) 74 | (labels ((dfs-helper (this-node phrase-type) 75 | (cond ((and (consp this-node) 76 | (atom (first this-node)) 77 | (atom (second this-node))) 78 | (let ((pos (symbol-name (first this-node)))) 79 | (unless (equalp pos "-NONE-") 80 | (let ((generic-pos 81 | ;; Leaving generic POS tags behind for now 82 | ;;(lookup-generic-pos :english pos))) 83 | (intern (string-upcase pos) :keyword))) 84 | (unless generic-pos 85 | (error "Unknown POS: '~A'" pos)) 86 | (push (list phrase-type generic-pos) leaf-phrases))))) 87 | ((and (consp this-node) 88 | (atom (first this-node)) 89 | (consp (second this-node))) 90 | (dolist (phrase (rest this-node)) 91 | (dfs-helper phrase (first this-node))))))) 92 | (handler-case 93 | (dfs-helper tree nil) 94 | (error (c) 95 | (log:error "~A" c) 96 | nil) 97 | (:no-error (rv) 98 | (declare (ignore rv)) 99 | (nreverse leaf-phrases)))))) 100 | 101 | (defmethod train-phrase-extractor ((language french) file) 102 | "Train the noun phrase extractor on a given french labeled corpus." 103 | (reset-chunker-tables language) 104 | (map-sexp-corpus 105 | (lambda (tree) 106 | (learn-phrase-pattern language (flatten-french-phrase-tree tree))) 107 | file) 108 | (compute-np-observation-likelihoods language) 109 | (compute-np-ngram-probabilities language) 110 | language) 111 | 112 | (defun make-french-db (&key 113 | profile-p 114 | save-p 115 | (user-lexicon "data/french-lexicon.txt") 116 | (stop-words-file "data/french-stop-words.txt") 117 | (pos-lex "data/french-pos.txt") 118 | (pos-train "data/french-pos.txt") 119 | (chunker-train "data/french-parsed.txt")) 120 | (let ((old-language (lookup-language "french"))) 121 | (when old-language 122 | (close-language old-language))) 123 | (let ((*language* (make-new-language 'french))) 124 | (setf (alphabet *language*) 125 | "áéíóúñü¡abcdefghijklmnopqrstuvwxyz0123456789-'/") 126 | (log:info "Training French NLP system...") 127 | (log:info "Building and training lexicon...") 128 | (load-stop-words *language* stop-words-file) 129 | (maybe-profile 130 | (make-lexicon *language* 131 | pos-lex 132 | :user-file user-lexicon 133 | :external-format :utf-8)) 134 | (log:info "Training POS tagger...") 135 | (maybe-profile (train-tagger *language* pos-train :external-format :utf-8)) 136 | (log:info "Training HMM Chunker...") 137 | (maybe-profile (train-phrase-extractor *language* chunker-train)) 138 | (when save-p 139 | (log:info "Freezing POS database...") 140 | (maybe-profile (freeze-nlp *language*))) 141 | (add-language *language*) 142 | *language*)) 143 | -------------------------------------------------------------------------------- /german.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defclass german (language) 4 | ((name :accessor name :initform "german" :initarg :name) 5 | (aliases :accessor aliases 6 | :initform '("de" "de-at" "de-de" "de-il" "de-lu" "de-ch") 7 | :initarg :aliases) 8 | (vowels :accessor vowels :initform '(#\a #\e #\i #\o #\u #\ä #\ö #\ü) 9 | :initarg :vowels) 10 | (default-encoding :accessor default-encoding :initform :utf-8 11 | :initarg :default-encoding))) 12 | 13 | (defmethod noun-p ((language german) symbol) 14 | (member symbol '(:NN :NE))) 15 | 16 | (defmethod adjective-p ((language german) symbol) 17 | (member symbol '(:ADJA :ADJD))) 18 | 19 | (defmethod verb-p ((language german) symbol) 20 | (member symbol '(:VAFIN :VAIMP :VAINF :VAPP :VMFIN :VMINF :VMPP :VVFIN :VVIMP :VVINF :VVIZU :VVPP))) 21 | 22 | (defmethod adverb-p ((language german) symbol) 23 | (member symbol '(:ADV :PAV))) 24 | 25 | (defmethod determiner-p ((language german) symbol) 26 | (member symbol '(:ART))) 27 | 28 | (defmethod pronoun-p ((language german) symbol) 29 | (member symbol '(:PDS :PDAT :PIS :PIAT :PIDAT :PPER :PPOSS :PPOSAT :PRELS :PRELAT :PRF :PWS :PWAT :PWAV))) 30 | 31 | (defmethod train-phrase-extractor ((language german) file) 32 | "Train the noun phrase extractor on a given german labeled corpus." 33 | (reset-chunker-tables language) 34 | (map-sexp-corpus 35 | (lambda (tree) 36 | (learn-phrase-pattern language (flatten-tiger-phrase-tree tree))) 37 | file) 38 | (compute-np-observation-likelihoods language) 39 | (compute-np-ngram-probabilities language) 40 | language) 41 | 42 | (defun make-german-db (&key 43 | profile-p 44 | save-p 45 | (user-lexicon "data/german-lexicon.txt") 46 | (stop-words-file "data/german-stop-words.txt") 47 | (pos-lex "data/german-pos.txt") 48 | (pos-train "data/german-pos.txt") 49 | (chunker-train "data/german-parsed.txt")) 50 | (let ((*language* (make-new-language 'german))) 51 | (setf (alphabet *language*) 52 | "áéíóúñü¡abcdefghijklmnopqrstuvwxyz0123456789-'/") 53 | (log:info "Training German NLP system...") 54 | (log:info "Building and training lexicon...") 55 | (load-stop-words *language* stop-words-file) 56 | (maybe-profile 57 | (make-lexicon *language* 58 | pos-lex 59 | :user-file user-lexicon 60 | :external-format :utf-8)) 61 | (log:info "Training POS tagger...") 62 | (maybe-profile (train-tagger *language* pos-train :external-format :utf-8)) 63 | (log:info "Training HMM Chunker...") 64 | (maybe-profile (train-phrase-extractor *language* chunker-train)) 65 | (when save-p 66 | (log:info "Freezing POS database...") 67 | (maybe-profile (freeze-nlp *language*))) 68 | (add-language *language*) 69 | *language*)) 70 | -------------------------------------------------------------------------------- /globals.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:nlp) 2 | 3 | (defvar *pos-db* nil) 4 | (defvar *language* nil) 5 | 6 | (defparameter *whitespace* '(#\Space #\Newline #\Return #\Tab)) 7 | 8 | (defparameter *sentence-start* (intern "" :keyword)) 9 | (defparameter *sentence-end* (intern "" :keyword)) 10 | 11 | (defparameter *punctuation* "([\\.\\!\\?])") 12 | -------------------------------------------------------------------------------- /initialize.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:nlp) 2 | 3 | (defmethod freeze-nlp ((language language) &optional file) 4 | (let ((file (or file (format nil "~A.dat" (name language))))) 5 | (let ((lexicon-dbm (lexicon-dbm language)) 6 | (plexicon-dbm (plexicon-dbm language)) 7 | (observations-dbm (observations-dbm language)) 8 | (word-occurrence-dbm (word-occurrence-dbm language))) 9 | (setf (lexicon-dbm language) nil 10 | (plexicon-dbm language) nil 11 | (observations-dbm language) nil 12 | (word-occurrence-dbm language) nil) 13 | (cl-store:store language file) 14 | (setf (lexicon-dbm language) lexicon-dbm 15 | (plexicon-dbm language) plexicon-dbm 16 | (observations-dbm language) observations-dbm 17 | (word-occurrence-dbm language) word-occurrence-dbm) 18 | language))) 19 | 20 | (defun thaw-nlp (file &key data-dir) 21 | (let ((language (cl-store:restore file))) 22 | (open-lexicon-dbm language :data-dir data-dir) 23 | (open-plexicon-dbm language :data-dir data-dir) 24 | (open-observations-dbm language :data-dir data-dir) 25 | (open-word-occurrence-dbm language :data-dir data-dir) 26 | (add-language language))) 27 | 28 | (defmacro maybe-profile ((&body body)) 29 | `(if profile-p 30 | (time (progn ,body)) 31 | (progn 32 | ,body))) 33 | -------------------------------------------------------------------------------- /italian.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defclass italian (language) 4 | ((name :accessor name :initform "italian" :initarg :name) 5 | (aliases :accessor aliases 6 | :initform '("it" "it-it" "it-ch") 7 | :initarg :aliases) 8 | (vowels :accessor vowels :initform 9 | '(#\a #\e #\i #\o #\u #\à #\è #\é #\ì #\í #\î #\ò #\ó #\ù #\ú) 10 | :initarg :vowels) 11 | (default-encoding :accessor default-encoding :initform :utf-8 12 | :initarg :default-encoding))) 13 | 14 | (defmethod noun-p ((language italian) symbol) 15 | (member symbol '(:N :S :SP))) 16 | 17 | (defmethod adjective-p ((language italian) symbol) 18 | (member symbol '(:A :AP))) 19 | 20 | (defmethod verb-p ((language italian) symbol) 21 | (member symbol '(:V))) 22 | 23 | (defmethod adverb-p ((language italian) symbol) 24 | (member symbol '(:B :BN))) 25 | 26 | (defmethod determiner-p ((language italian) symbol) 27 | (member symbol '(:D :T :DD :DE :DQ :DR))) 28 | 29 | (defmethod pronoun-p ((language italian) symbol) 30 | (member symbol '(:P :PC :PD :PE :PE :PP :PQ :PR))) 31 | 32 | (defmethod load-contraction-table ((language italian) 33 | &optional 34 | (file "data/italian-contractions.txt")) 35 | (when (probe-file file) 36 | (with-open-file (in file) 37 | (setf (contraction-table language) (make-hash-table :test 'equalp)) 38 | (do ((line (read-line in nil :eof) (read-line in nil :eof))) 39 | ((eql line :eof)) 40 | (multiple-value-bind (match-p matches) 41 | (scan-to-strings "^([a-zA-Z']+)\\s+(.*)$" line) 42 | (when match-p 43 | (setf (gethash (elt matches 0) (contraction-table language)) 44 | (split "\\s+" (elt matches 1))))))))) 45 | 46 | (defun make-italian-db (&key 47 | profile-p 48 | save-p 49 | (contraction-file "data/italian-contractions.txt") 50 | (user-lexicon "data/italian-lexicon.txt") 51 | (stop-words-file "data/italian-stop-words.txt") 52 | (pos-lex "data/italian-pos-all.txt") 53 | (pos-train "data/italian-pos.txt") 54 | (chunker-train "data/italian-parsed.txt")) 55 | (let ((*language* (make-new-language 'italian))) 56 | (setf (alphabet *language*) 57 | "àèéìíîòóùúabcdefghijklmnopqrstuvwxyz0123456789-'/") 58 | (log:info "Training Italian NLP system...") 59 | (log:info "Building and training lexicon...") 60 | (load-stop-words *language* stop-words-file) 61 | (load-contraction-table *language* contraction-file) 62 | (maybe-profile 63 | (make-lexicon *language* 64 | pos-lex 65 | :user-file user-lexicon 66 | :external-format :utf-8)) 67 | ;;(clean-lexicon *language*) 68 | (log:info "Training POS tagger...") 69 | (maybe-profile (train-tagger *language* pos-train :external-format :utf-8)) 70 | (log:info "Training HMM Chunker...") 71 | (maybe-profile (train-phrase-extractor *language* chunker-train)) 72 | (when save-p 73 | (log:info "Freezing POS database...") 74 | (maybe-profile (freeze-nlp *language*))) 75 | (add-language *language*) 76 | *language*)) 77 | -------------------------------------------------------------------------------- /lgrammar.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:nlp) 2 | 3 | (defun print-lex-sym (ls stream d) 4 | (declare (ignore d)) 5 | (format stream "#" 6 | (sym ls) (word ls) (pos ls) (p ls))) 7 | 8 | (defstruct (lex-sym 9 | (:conc-name nil) 10 | (:print-function print-lex-sym)) 11 | sym word pos p) 12 | 13 | (defmethod lex-sym-equal ((s1 lex-sym) (s2 lex-sym)) 14 | (and (eq (sym s1) (sym s2)) 15 | (equal (word s1) (word s2)) 16 | (eq (pos s1) (pos s2)))) 17 | 18 | (defun sxhash-ls (ls) 19 | (sxhash (list (sym ls) (word ls) (pos ls)))) 20 | 21 | (sb-ext:define-hash-table-test lex-sym-equal sxhash-ls) 22 | 23 | (defun make-lcfg-table (&key synchronized) 24 | (make-hash-table :test 'lex-sym-equal :synchronized synchronized)) 25 | 26 | (defun extract-sentence-lgrammar (tree) 27 | (let ((grammar (make-lcfg-table)) (start-symbol (first tree))) 28 | (labels 29 | ((walk (tree) 30 | (cond ((null tree) nil) 31 | ((and (consp tree) 32 | (atom (first tree)) 33 | (not (eql '-NONE- (first tree))) 34 | (consp (second tree))) 35 | (let ((children nil)) 36 | (dolist (child (rest tree)) 37 | (push (first child) children) 38 | (walk child)) 39 | (if (gethash (first tree) grammar) 40 | (pushnew (reverse children) 41 | (gethash (first tree) grammar) 42 | :test 'equalp) 43 | (setf (gethash (first tree) grammar) 44 | (list (reverse children)))))) 45 | ((and (consp tree) 46 | (atom (first tree)) 47 | (not (eql '-NONE- (first tree))) 48 | (atom (second tree))) 49 | (when (numberp (second tree)) 50 | (setf (second tree) 51 | (intern (format nil "~D" (second tree))))) 52 | (let ((sym (make-lex-sym :sym (first tree) 53 | :word (second tree) 54 | :pos (first tree)))) 55 | (if (gethash sym grammar) 56 | (pushnew (symbol-name (second tree)) 57 | (gethash (first tree) grammar) 58 | :test 'equalp) 59 | (setf (gethash (first tree) grammar) 60 | (list (symbol-name (second tree)))))) 61 | ((consp tree) 62 | (dolist (subtree tree) 63 | (walk subtree)))))) 64 | (walk tree) 65 | (values grammar start-symbol))))) 66 | -------------------------------------------------------------------------------- /lib/cffi-wordnet/LICENSE: -------------------------------------------------------------------------------- 1 | cffi-wordnet 2 | 3 | Copyright (c) 2015 Kevin Raison 4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the "Software"), to deal in the Software without 8 | restriction, including without limitation the rights to use, 9 | copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the 11 | Software is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /lib/cffi-wordnet/README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/cffi-wordnet/README -------------------------------------------------------------------------------- /lib/cffi-wordnet/cffi-wordnet.asd: -------------------------------------------------------------------------------- 1 | ;; ASDF package description for cffi-wordnet -*- Lisp -*- 2 | 3 | (defpackage :cffi-wordnet-system (:use :cl :asdf)) 4 | (in-package :cffi-wordnet-system) 5 | 6 | (defsystem cffi-wordnet 7 | :name "cffi-wordnet" 8 | :maintainer "Kevin Raison" 9 | :author "Kevin Raison " 10 | :version "0.1" 11 | :description "CFFI Interface to Wordnet 3.0" 12 | :depends-on (:cffi 13 | :cl-ppcre 14 | :graph-utils 15 | :alexandria) 16 | :components ((:file "wordnet-package") 17 | (:file "cffi" :depends-on ("wordnet-package")) 18 | (:file "constants" :depends-on ("cffi")) 19 | (:file "wordnet" :depends-on ("constants")))) 20 | -------------------------------------------------------------------------------- /lib/cffi-wordnet/constants.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:cffi-wordnet) 2 | 3 | (defparameter +noun+ NOUN) 4 | (defparameter +verb+ VERB) 5 | (defparameter +adjective+ ADJ) 6 | (defparameter +adverb+ ADV) 7 | (defparameter +satellite+ SATELLITE) 8 | (defparameter +all-senses+ ALLSENSES) 9 | (defparameter +synonyms+ SYNS) 10 | (defparameter +freq+ FREQ) 11 | (defparameter +frames+ FRAMES) 12 | (defparameter +coords+ COORDS) 13 | (defparameter +relatives+ RELATIVES) 14 | (defparameter +meronym+ HMERONYM) 15 | (defparameter +holonym+ HHOLONYM) 16 | (defparameter +hypernym+ HYPERPTR) 17 | -------------------------------------------------------------------------------- /lib/cffi-wordnet/wordnet-package.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:cl-user) 2 | 3 | (defpackage #:cffi-wordnet 4 | (:use #:cl #:cffi #:graph-utils) 5 | (:nicknames #:wordnet) 6 | (:export #:wordnet-init 7 | ;;#:synonyms 8 | ;;#:hypernyms 9 | ;;#:holonyms 10 | #:synset-vertex 11 | #:make-synset-vertex 12 | #:morph-word 13 | #:hypernym-graph 14 | #:holonym-graph 15 | #:synset-vertex 16 | #:word-list 17 | #:+noun+ 18 | #:+verb+ 19 | #:+adjective+ 20 | #:+adverb+ 21 | #:+satellite+ 22 | #:+all-senses+ 23 | #:+synonyms+ 24 | #:+holonym+ 25 | #:+hypernym+ 26 | #:+meronym+)) 27 | -------------------------------------------------------------------------------- /lib/cffi-wordnet/wordnet.swig: -------------------------------------------------------------------------------- 1 | %module wordnet 2 | 3 | %include "/usr/local/WordNet-3.0/include/wn.h" 4 | 5 | -------------------------------------------------------------------------------- /lib/graph-utils/LICENSE: -------------------------------------------------------------------------------- 1 | graph-utils is licensed under the terms of the Lisp Lesser GNU 2 | Public License (http://opensource.franz.com/preamble.html), known as 3 | the LLGPL. The LLGPL consists of a preamble (see above URL) and the 4 | LGPL. Where these conflict, the preamble takes precedence. 5 | graph-utils is referenced in the preamble as the "LIBRARY." 6 | -------------------------------------------------------------------------------- /lib/graph-utils/README: -------------------------------------------------------------------------------- 1 | graph-utils: a graph analysis library for Common Lisp 2 | by: Kevin Raison 3 | dependencies: cl-ppcre, dso-lex, cl-yacc, trivial-shell, parse-number, bordeaux-threads 4 | 5 | Implements the following functionality: 6 | 7 | 1. Create directed and undirected graphs with typed or untyped edges 8 | 2. Compare graphs using graph= method 9 | 3. Sparse 2D array representation of adjacency matrix 10 | 4. Neighbors, outbound-edges and inbound-edges methods for listing a node's neighbors in directed and undirected graphs 11 | 5. Edge deletion and creation methods 12 | 6. Functions for mapping over all nodes and edges 13 | 7. Denisty calculation 14 | 8. Node degree & degree distribution calculation 15 | 9. An implementation of Dijkstra's algorithm 16 | 10. Distance map calculation 17 | 11. Methods to find all components in a graph 18 | 12. Graph visualization using the Graphviz library 19 | 13. Random graph generation using the erdos-renyi and barabasi-albert algorithms 20 | 14. Clustering algorithms based on edge betweenness and edge span 21 | 15. Page rank and page rank distribution calculation 22 | 16. Hubs and authorities calculation 23 | 17. Graph center calculator 24 | 18. Maximum flow calculation for directed graphs using The Push-Relabel method, Karzanov's algorithm, Dinic's algorithm, and the Edmond/Karp method 25 | 19. Maximum matching for bipartite graphs using the max-flow method 26 | 20. GML and Pajek .net graph file format parsers 27 | 21. A prolog implementation for searching graphs based on PAIP 28 | 29 | -------------------------------------------------------------------------------- /lib/graph-utils/TODO: -------------------------------------------------------------------------------- 1 | TODO: 2 | - Add a node class to make life easier. 3 | - Add an edge class to make life easier. 4 | - Optimize! 5 | 6 | DONE: 7 | - Add typed edges. 8 | - Get rid of the adjacency matrix and replace with sparse representation. 9 | - Create a directed graph subclass and update methods for it. 10 | - Add maximum flow methods for directed graphs 11 | - Use max flow algorithms to compute maximum matchings for bipartite graphs. 12 | -------------------------------------------------------------------------------- /lib/graph-utils/bipartite.lisp: -------------------------------------------------------------------------------- 1 | (in-package :graph-utils) 2 | 3 | (defmethod bipartite? ((graph graph) &key show-partitions?) 4 | "Checks if a graph is bipartite. If show-partitions? is T, return the 5 | partitions." 6 | (let ((color-table (make-hash-table)) 7 | (partition-table (make-hash-table)) 8 | (queue (make-empty-queue)) 9 | (components (find-components graph))) 10 | (flet ((set-color (node color) 11 | (setf (gethash node color-table) color)) 12 | (color-of (node) 13 | (gethash node color-table)) 14 | (partition-of (node) 15 | (gethash node partition-table)) 16 | (set-partition (node p) 17 | (setf (gethash node partition-table) p))) 18 | (map-nodes #'(lambda (name n) 19 | (declare (ignore name)) 20 | (set-partition n 0) 21 | (set-color n :white)) 22 | graph) 23 | (dolist (component components) 24 | (let ((start (nth (random (length component)) component))) 25 | (set-partition start 1) 26 | (enqueue queue start) 27 | (set-color start :grey) 28 | (loop until (empty-queue? queue) do 29 | (let* ((node (dequeue queue))) 30 | (dolist (neighbor (neighbors graph node)) 31 | (when (= (partition-of neighbor) (partition-of node)) 32 | (return-from bipartite? nil)) 33 | (when (eql :white (color-of neighbor)) 34 | (set-color neighbor :gray) 35 | (set-partition neighbor (- 3 (partition-of node))) 36 | (enqueue queue neighbor))) 37 | (set-color node :black))))) 38 | (if show-partitions? 39 | (let ((a nil) (b nil)) 40 | (maphash #'(lambda (node p) 41 | (if (eq p 1) 42 | (push node a) 43 | (push node b))) 44 | partition-table) 45 | (values a b)) 46 | t)))) 47 | 48 | (defmethod compute-maximum-matching ((graph graph) v1 v2 49 | &key (algorithm :dinic)) 50 | "Compute a maximum matching for the bipartite graph with partitions v1 and 51 | v2." 52 | (let* ((flow-net (make-graph :directed? t))) 53 | (map-nodes #'(lambda (name id) 54 | (declare (ignore name)) 55 | (add-node flow-net id)) 56 | graph) 57 | (map-edges #'(lambda (n1 n2 w) 58 | (declare (ignore w)) 59 | (if (member n1 v2) 60 | (add-edge flow-net n2 n1 :weight 1) 61 | (add-edge flow-net n1 n2 :weight 1))) 62 | graph) 63 | (let ((source (add-node flow-net :source)) 64 | (sink (add-node flow-net :sink))) 65 | (dolist (node v1) 66 | (add-edge flow-net source node :weight 1)) 67 | (dolist (node v2) 68 | (add-edge flow-net node sink :weight 1)) 69 | ;;(visualize flow-net :render? t 70 | ;;:file (format nil "data/matching-~A.dot" algorithm)) 71 | (multiple-value-bind (flow edges) 72 | (compute-maximum-flow flow-net source sink :algorithm algorithm) 73 | (values 74 | (mapcar #'(lambda (edge) 75 | (subseq edge 0 2)) 76 | (remove-if #'(lambda (edge) 77 | (or (/= 1 (nth 2 edge)) 78 | (eq (first edge) source) 79 | (eq (first edge) sink) 80 | (eq (second edge) source) 81 | (eq (second edge) sink))) 82 | edges)) 83 | v1 84 | v2 85 | flow))))) 86 | 87 | (defun test-bp () 88 | (let ((graph 89 | (parse-pajek "/home/raison/work/graph-utils/data/bipartite1.net"))) 90 | (multiple-value-bind (black white) (bipartite? graph :show-partitions? t) 91 | (if (and black white) 92 | (if (every #'evenp black) 93 | (multiple-value-bind (matching black white flow) 94 | (compute-maximum-matching graph black white 95 | :algorithm :karzanov) 96 | (dbg "~A~%~A~%~A~%~A" matching black white flow) 97 | matching) 98 | (multiple-value-bind (matching black white flow) 99 | (compute-maximum-matching graph white black 100 | :algorithm :karzanov) 101 | (dbg "~A~%~A~%~A~%~A" matching black white flow) 102 | matching)) 103 | (error "~A is not bipartite!" graph))))) 104 | 105 | -------------------------------------------------------------------------------- /lib/graph-utils/cut.lisp: -------------------------------------------------------------------------------- 1 | (in-package :graph-utils) 2 | 3 | (defmethod minimal-cut! ((graph graph)) 4 | (let ((removed-edges nil)) 5 | (labels ((cut (g) 6 | (cond ((or (< (node-count g) 2) 7 | (= 0 (edge-count g)) 8 | (>= (length (find-components g)) 2)) 9 | g) 10 | (t 11 | (push (first (cluster graph :edge-span 12 | :edge-removal-count 1)) 13 | removed-edges) 14 | (cut graph))))) 15 | (cut graph)) 16 | (mapcar #'(lambda (edge) 17 | (subseq edge 0 2)) 18 | removed-edges))) 19 | 20 | (defmethod minimal-cut ((graph graph) &key (method :cluster)) 21 | (let ((g (copy-graph graph))) 22 | (cond ((eq method :cluster) 23 | (values (minimal-cut! g) g))))) 24 | 25 | -------------------------------------------------------------------------------- /lib/graph-utils/data/Dining-cut.dot: -------------------------------------------------------------------------------- 1 | digraph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "Robin" -> "Helen" [w=1,label=1]; 6 | "Robin" -> "Eva" [w=1,label=1]; 7 | "Robin" [fillcolor="#ffff00"]; 8 | "Lena" -> "Louise" [w=1,label=1]; 9 | "Lena" [fillcolor="#ffff00"]; 10 | "Ella" -> "Helen" [w=1,label=1]; 11 | "Ella" -> "Ellen" [w=1,label=1]; 12 | "Ella" [fillcolor="#ffff00"]; 13 | "Ellen" -> "Edna" [w=1,label=1]; 14 | "Ellen" [fillcolor="#ffff00"]; 15 | "Edna" -> "Adele" [w=1,label=1]; 16 | "Edna" -> "Mary" [w=1,label=1]; 17 | "Edna" [fillcolor="#ffff00"]; 18 | "Maxine" -> "Adele" [w=1,label=1]; 19 | "Maxine" [fillcolor="#ffff00"]; 20 | "Louise" -> "Marion" [w=1,label=1]; 21 | "Louise" [fillcolor="#ffff00"]; 22 | "Laura" -> "Edna" [w=1,label=1]; 23 | "Laura" [fillcolor="#ffff00"]; 24 | "Ruth" -> "Hilda" [w=1,label=1]; 25 | "Ruth" [fillcolor="#ffff00"]; 26 | "Cora" -> "Ada" [w=1,label=1]; 27 | "Cora" -> "Jean" [w=1,label=1]; 28 | "Cora" [fillcolor="#ffff00"]; 29 | "Anna" -> "Lena" [w=1,label=1]; 30 | "Anna" [fillcolor="#ffff00"]; 31 | "Jane" -> "Mary" [w=1,label=1]; 32 | "Jane" [fillcolor="#ffff00"]; 33 | "Alice" -> "Martha" [w=1,label=1]; 34 | "Alice" [fillcolor="#ffff00"]; 35 | "Hilda" -> "Hazel" [w=1,label=1]; 36 | "Hilda" -> "Betty" [w=1,label=1]; 37 | "Hilda" [fillcolor="#ffff00"]; 38 | "Hazel" -> "Hilda" [w=1,label=1]; 39 | "Hazel" [fillcolor="#ffff00"]; 40 | "Martha" -> "Anna" [w=1,label=1]; 41 | "Martha" [fillcolor="#ffff00"]; 42 | "Frances" -> "Eva" [w=1,label=1]; 43 | "Frances" [fillcolor="#ffff00"]; 44 | "Helen" -> "Jean" [w=1,label=1]; 45 | "Helen" [fillcolor="#ffff00"]; 46 | "Marion" -> "Martha" [w=1,label=1]; 47 | "Marion" -> "Frances" [w=1,label=1]; 48 | "Marion" [fillcolor="#ffff00"]; 49 | "Eva" -> "Maxine" [w=1,label=1]; 50 | "Eva" [fillcolor="#ffff00"]; 51 | "Irene" [fillcolor="#ffff00"]; 52 | "Adele" -> "Marion" [w=1,label=1]; 53 | "Adele" [fillcolor="#ffff00"]; 54 | "Betty" -> "Hilda" [w=1,label=1]; 55 | "Betty" -> "Edna" [w=1,label=1]; 56 | "Betty" [fillcolor="#ffff00"]; 57 | "Ada" -> "Cora" [w=1,label=1]; 58 | "Ada" [fillcolor="#ffff00"]; 59 | "Mary" -> "Edna" [w=1,label=1]; 60 | "Mary" -> "Jane" [w=1,label=1]; 61 | "Mary" [fillcolor="#ffff00"]; 62 | "Jean" -> "Robin" [w=1,label=1]; 63 | "Jean" [fillcolor="#ffff00"]; 64 | } 65 | -------------------------------------------------------------------------------- /lib/graph-utils/data/Dining-cut.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/Dining-cut.png -------------------------------------------------------------------------------- /lib/graph-utils/data/Dining.dot: -------------------------------------------------------------------------------- 1 | digraph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "Robin" -> "Helen" [w=1,label=1]; 6 | "Robin" -> "Eva" [w=1,label=1]; 7 | "Robin" [fillcolor="#ffff00"]; 8 | "Lena" -> "Louise" [w=1,label=1]; 9 | "Lena" -> "Marion" [w=1,label=1]; 10 | "Lena" [fillcolor="#ffff00"]; 11 | "Ella" -> "Helen" [w=1,label=1]; 12 | "Ella" -> "Ellen" [w=1,label=1]; 13 | "Ella" [fillcolor="#ffff00"]; 14 | "Ellen" -> "Edna" [w=1,label=1]; 15 | "Ellen" -> "Anna" [w=1,label=1]; 16 | "Ellen" [fillcolor="#ffff00"]; 17 | "Edna" -> "Adele" [w=1,label=1]; 18 | "Edna" -> "Mary" [w=1,label=1]; 19 | "Edna" [fillcolor="#ffff00"]; 20 | "Maxine" -> "Eva" [w=1,label=1]; 21 | "Maxine" -> "Adele" [w=1,label=1]; 22 | "Maxine" [fillcolor="#ffff00"]; 23 | "Louise" -> "Marion" [w=1,label=1]; 24 | "Louise" -> "Lena" [w=1,label=1]; 25 | "Louise" [fillcolor="#ffff00"]; 26 | "Laura" -> "Eva" [w=1,label=1]; 27 | "Laura" -> "Edna" [w=1,label=1]; 28 | "Laura" [fillcolor="#ffff00"]; 29 | "Ruth" -> "Hilda" [w=1,label=1]; 30 | "Ruth" -> "Jane" [w=1,label=1]; 31 | "Ruth" [fillcolor="#ffff00"]; 32 | "Cora" -> "Ada" [w=1,label=1]; 33 | "Cora" -> "Jean" [w=1,label=1]; 34 | "Cora" [fillcolor="#ffff00"]; 35 | "Anna" -> "Maxine" [w=1,label=1]; 36 | "Anna" -> "Lena" [w=1,label=1]; 37 | "Anna" [fillcolor="#ffff00"]; 38 | "Jane" -> "Adele" [w=1,label=1]; 39 | "Jane" -> "Mary" [w=1,label=1]; 40 | "Jane" [fillcolor="#ffff00"]; 41 | "Alice" -> "Martha" [w=1,label=1]; 42 | "Alice" -> "Eva" [w=1,label=1]; 43 | "Alice" [fillcolor="#ffff00"]; 44 | "Hilda" -> "Hazel" [w=1,label=1]; 45 | "Hilda" -> "Betty" [w=1,label=1]; 46 | "Hilda" [fillcolor="#ffff00"]; 47 | "Hazel" -> "Hilda" [w=1,label=1]; 48 | "Hazel" -> "Anna" [w=1,label=1]; 49 | "Hazel" [fillcolor="#ffff00"]; 50 | "Martha" -> "Marion" [w=1,label=1]; 51 | "Martha" -> "Anna" [w=1,label=1]; 52 | "Martha" [fillcolor="#ffff00"]; 53 | "Frances" -> "Marion" [w=1,label=1]; 54 | "Frances" -> "Eva" [w=1,label=1]; 55 | "Frances" [fillcolor="#ffff00"]; 56 | "Helen" -> "Jean" [w=1,label=1]; 57 | "Helen" -> "Eva" [w=1,label=1]; 58 | "Helen" [fillcolor="#ffff00"]; 59 | "Marion" -> "Martha" [w=1,label=1]; 60 | "Marion" -> "Frances" [w=1,label=1]; 61 | "Marion" [fillcolor="#ffff00"]; 62 | "Eva" -> "Marion" [w=1,label=1]; 63 | "Eva" -> "Maxine" [w=1,label=1]; 64 | "Eva" [fillcolor="#ffff00"]; 65 | "Irene" -> "Hilda" [w=1,label=1]; 66 | "Irene" -> "Ellen" [w=1,label=1]; 67 | "Irene" [fillcolor="#ffff00"]; 68 | "Adele" -> "Marion" [w=1,label=1]; 69 | "Adele" -> "Frances" [w=1,label=1]; 70 | "Adele" [fillcolor="#ffff00"]; 71 | "Betty" -> "Hilda" [w=1,label=1]; 72 | "Betty" -> "Edna" [w=1,label=1]; 73 | "Betty" [fillcolor="#ffff00"]; 74 | "Ada" -> "Cora" [w=1,label=1]; 75 | "Ada" -> "Louise" [w=1,label=1]; 76 | "Ada" [fillcolor="#ffff00"]; 77 | "Mary" -> "Edna" [w=1,label=1]; 78 | "Mary" -> "Jane" [w=1,label=1]; 79 | "Mary" [fillcolor="#ffff00"]; 80 | "Jean" -> "Helen" [w=1,label=1]; 81 | "Jean" -> "Robin" [w=1,label=1]; 82 | "Jean" [fillcolor="#ffff00"]; 83 | } 84 | -------------------------------------------------------------------------------- /lib/graph-utils/data/Dining.net: -------------------------------------------------------------------------------- 1 | *Vertices 26 2 | 1 "Ada" 0.1646 0.2144 0.5000 3 | 2 "Cora" 0.0481 0.3869 0.5000 4 | 3 "Louise" 0.3472 0.1913 0.5000 5 | 4 "Jean" 0.1063 0.5935 0.5000 6 | 5 "Helen" 0.2892 0.6688 0.5000 7 | 6 "Martha" 0.4630 0.5179 0.5000 8 | 7 "Alice" 0.3657 0.6326 0.5000 9 | 8 "Robin" 0.2274 0.5741 0.5000 10 | 9 "Marion" 0.4288 0.3271 0.5000 11 | 10 "Maxine" 0.5219 0.4468 0.5000 12 | 11 "Lena" 0.5334 0.2248 0.5000 13 | 12 "Hazel" 0.7166 0.6744 0.5000 14 | 13 "Hilda" 0.8755 0.6811 0.5000 15 | 14 "Frances" 0.4968 0.3786 0.5000 16 | 15 "Eva" 0.3722 0.4767 0.5000 17 | 16 "Ruth" 0.9519 0.4789 0.5000 18 | 17 "Edna" 0.6895 0.4929 0.5000 19 | 18 "Adele" 0.6301 0.3246 0.5000 20 | 19 "Jane" 0.8231 0.3408 0.5000 21 | 20 "Anna" 0.5722 0.5177 0.5000 22 | 21 "Mary" 0.7594 0.4263 0.5000 23 | 22 "Betty" 0.8458 0.5521 0.5000 24 | 23 "Ella" 0.4343 0.7865 0.5000 25 | 24 "Ellen" 0.6054 0.6966 0.5000 26 | 25 "Laura" 0.5101 0.6133 0.5000 27 | 26 "Irene" 0.7478 0.8087 0.5000 28 | *Arcs 29 | 1 3 30 | 1 2 31 | 2 1 32 | 2 4 33 | 3 9 34 | 3 11 35 | 4 5 36 | 4 8 37 | 5 4 38 | 5 15 39 | 6 9 40 | 6 20 41 | 7 15 42 | 7 6 43 | 8 15 44 | 8 5 45 | 9 6 46 | 9 14 47 | 10 18 48 | 10 15 49 | 11 9 50 | 11 3 51 | 12 13 52 | 12 20 53 | 13 22 54 | 13 12 55 | 14 15 56 | 14 9 57 | 15 10 58 | 15 9 59 | 16 19 60 | 16 13 61 | 17 21 62 | 17 18 63 | 18 14 64 | 18 9 65 | 19 18 66 | 19 21 67 | 20 10 68 | 20 11 69 | 21 17 70 | 21 19 71 | 22 17 72 | 22 13 73 | 23 24 74 | 23 5 75 | 24 20 76 | 24 17 77 | 25 15 78 | 25 17 79 | 26 13 80 | 26 24 -------------------------------------------------------------------------------- /lib/graph-utils/data/Dining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/Dining.png -------------------------------------------------------------------------------- /lib/graph-utils/data/adjnoun-cut.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/adjnoun-cut.png -------------------------------------------------------------------------------- /lib/graph-utils/data/adjnoun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/adjnoun.png -------------------------------------------------------------------------------- /lib/graph-utils/data/adjnoun.txt: -------------------------------------------------------------------------------- 1 | The file adjnoun.gml contains the network of common adjective and noun 2 | adjacencies for the novel "David Copperfield" by Charles Dickens, as 3 | described by M. Newman. Nodes represent the most commonly occurring 4 | adjectives and nouns in the book. Node values are 0 for adjectives and 1 5 | for nouns. Edges connect any pair of words that occur in adjacent position 6 | in the text of the book. Please cite M. E. J. Newman, Finding community 7 | structure in networks using the eigenvectors of matrices, Preprint 8 | physics/0605087 (2006). 9 | -------------------------------------------------------------------------------- /lib/graph-utils/data/bipartite.dot: -------------------------------------------------------------------------------- 1 | digraph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "0" -> "1" [w=14,label=14]; 6 | "0" [fillcolor="#ffff00"]; 7 | "1" [fillcolor="#ffff00"]; 8 | "2" [fillcolor="#ffff00"]; 9 | "3" -> "1" [w=2,label=2]; 10 | "3" -> "2" [w=6,label=6]; 11 | "3" [fillcolor="#ffff00"]; 12 | } 13 | -------------------------------------------------------------------------------- /lib/graph-utils/data/bipartite.net: -------------------------------------------------------------------------------- 1 | *Vertices 4 2 | 0 0 3 | 1 1 4 | 2 2 5 | 3 3 6 | *Arcs 7 | 0 1 14 8 | 3 1 2 9 | 3 2 6 10 | -------------------------------------------------------------------------------- /lib/graph-utils/data/bipartite1.dot: -------------------------------------------------------------------------------- 1 | digraph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "0" -> "1" [w=1,label=1]; 6 | "0" [fillcolor="#ffff00"]; 7 | "1" -> "4" [w=1,label=1]; 8 | "1" -> "10" [w=1,label=1]; 9 | "1" [fillcolor="#ffff00"]; 10 | "2" -> "3" [w=1,label=1]; 11 | "2" -> "5" [w=1,label=1]; 12 | "2" [fillcolor="#ffff00"]; 13 | "3" [fillcolor="#ffff00"]; 14 | "4" -> "5" [w=1,label=1]; 15 | "4" [fillcolor="#ffff00"]; 16 | "5" [fillcolor="#ffff00"]; 17 | "6" -> "5" [w=1,label=1]; 18 | "6" -> "7" [w=1,label=1]; 19 | "6" [fillcolor="#ffff00"]; 20 | "7" [fillcolor="#ffff00"]; 21 | "8" -> "9" [w=1,label=1]; 22 | "8" [fillcolor="#ffff00"]; 23 | "9" -> "10" [w=1,label=1]; 24 | "9" [fillcolor="#ffff00"]; 25 | "10" [fillcolor="#ffff00"]; 26 | "11" -> "10" [w=1,label=1]; 27 | "11" [fillcolor="#ffff00"]; 28 | } 29 | -------------------------------------------------------------------------------- /lib/graph-utils/data/bipartite1.net: -------------------------------------------------------------------------------- 1 | *Vertices 12 2 | 0 0 3 | 1 1 4 | 2 2 5 | 3 3 6 | 4 4 7 | 5 5 8 | 6 6 9 | 7 7 10 | 8 8 11 | 9 9 12 | 10 10 13 | 11 11 14 | *Arcs 15 | 0 1 1 16 | 2 3 1 17 | 4 5 1 18 | 6 7 1 19 | 1 4 1 20 | 2 5 1 21 | 6 5 1 22 | 8 9 1 23 | 1 10 1 24 | 11 10 1 25 | 9 10 1 26 | -------------------------------------------------------------------------------- /lib/graph-utils/data/dolphins-cut.dot: -------------------------------------------------------------------------------- 1 | graph graphutils { 2 | node [ color = black, fillcolor = while, style = filled ]; 3 | "Beak" -- "Fish" [w=1]; 4 | "Beak" -- "Grin" [w=1]; 5 | "Beak" -- "Haecksel" [w=1]; 6 | "Beak" -- "SN9" [w=1]; 7 | "Beak" -- "SN96" [w=1]; 8 | "Beak" -- "TR77" [w=1]; 9 | "Beak" [fillcolor="#ffff00"]; 10 | "Beescratch" -- "Jet" [w=1]; 11 | "Beescratch" -- "Knit" [w=1]; 12 | "Beescratch" -- "Notch" [w=1]; 13 | "Beescratch" -- "Number1" [w=1]; 14 | "Beescratch" [fillcolor="#ffff00"]; 15 | "Bumper" -- "Fish" [w=1]; 16 | "Bumper" [fillcolor="#ffff00"]; 17 | "CCL" -- "Double" [w=1]; 18 | "CCL" -- "Zap" [w=1]; 19 | "CCL" [fillcolor="#ffff00"]; 20 | "Cross" -- "Trigger" [w=1]; 21 | "Cross" [fillcolor="#ffff00"]; 22 | "DN16" -- "Feather" [w=1]; 23 | "DN16" -- "Gallatin" [w=1]; 24 | "DN16" -- "Wave" [w=1]; 25 | "DN16" -- "Web" [w=1]; 26 | "DN16" [fillcolor="#ffff00"]; 27 | "DN21" -- "Feather" [w=1]; 28 | "DN21" -- "Jet" [w=1]; 29 | "DN21" -- "Upbang" [w=1]; 30 | "DN21" [fillcolor="#ffff00"]; 31 | "DN63" -- "Knit" [w=1]; 32 | "DN63" -- "PL" [w=1]; 33 | "DN63" [fillcolor="#ffff00"]; 34 | "Double" -- "Kringel" [w=1]; 35 | "Double" -- "Oscar" [w=1]; 36 | "Double" [fillcolor="#ffff00"]; 37 | "Feather" -- "Ripplefluke" [w=1]; 38 | "Feather" -- "SN90" [w=1]; 39 | "Feather" [fillcolor="#ffff00"]; 40 | "Fish" [fillcolor="#ffff00"]; 41 | "Five" -- "Trigger" [w=1]; 42 | "Five" [fillcolor="#ffff00"]; 43 | "Fork" -- "Scabs" [w=1]; 44 | "Fork" [fillcolor="#ffff00"]; 45 | "Gallatin" [fillcolor="#ffff00"]; 46 | "Grin" -- "Hook" [w=1]; 47 | "Grin" -- "Scabs" [w=1]; 48 | "Grin" -- "Shmuddel" [w=1]; 49 | "Grin" -- "SN4" [w=1]; 50 | "Grin" -- "SN63" [w=1]; 51 | "Grin" -- "Stripes" [w=1]; 52 | "Grin" -- "TR99" [w=1]; 53 | "Grin" -- "TSN103" [w=1]; 54 | "Grin" [fillcolor="#ffff00"]; 55 | "Haecksel" -- "Jonah" [w=1]; 56 | "Haecksel" -- "MN83" [w=1]; 57 | "Haecksel" -- "Topless" [w=1]; 58 | "Haecksel" -- "Vau" [w=1]; 59 | "Haecksel" [fillcolor="#ffff00"]; 60 | "Hook" -- "Kringel" [w=1]; 61 | "Hook" [fillcolor="#ffff00"]; 62 | "Jet" -- "MN23" [w=1]; 63 | "Jet" -- "Mus" [w=1]; 64 | "Jet" -- "Quasi" [w=1]; 65 | "Jet" [fillcolor="#ffff00"]; 66 | "Jonah" -- "MN105" [w=1]; 67 | "Jonah" -- "Patchback" [w=1]; 68 | "Jonah" -- "Trigger" [w=1]; 69 | "Jonah" [fillcolor="#ffff00"]; 70 | "Knit" [fillcolor="#ffff00"]; 71 | "Kringel" -- "Thumper" [w=1]; 72 | "Kringel" [fillcolor="#ffff00"]; 73 | "MN105" [fillcolor="#ffff00"]; 74 | "MN23" [fillcolor="#ffff00"]; 75 | "MN60" -- "Topless" [w=1]; 76 | "MN60" [fillcolor="#ffff00"]; 77 | "MN83" [fillcolor="#ffff00"]; 78 | "Mus" [fillcolor="#ffff00"]; 79 | "Notch" [fillcolor="#ffff00"]; 80 | "Number1" [fillcolor="#ffff00"]; 81 | "Oscar" -- "PL" [w=1]; 82 | "Oscar" [fillcolor="#ffff00"]; 83 | "Patchback" -- "SMN5" [w=1]; 84 | "Patchback" [fillcolor="#ffff00"]; 85 | "PL" [fillcolor="#ffff00"]; 86 | "Quasi" [fillcolor="#ffff00"]; 87 | "Ripplefluke" -- "Zig" [w=1]; 88 | "Ripplefluke" [fillcolor="#ffff00"]; 89 | "Scabs" [fillcolor="#ffff00"]; 90 | "Shmuddel" -- "TR88" [w=1]; 91 | "Shmuddel" [fillcolor="#ffff00"]; 92 | "SMN5" [fillcolor="#ffff00"]; 93 | "SN100" -- "SN4" [w=1]; 94 | "SN100" -- "SN89" [w=1]; 95 | "SN100" [fillcolor="#ffff00"]; 96 | "SN4" -- "Zipfel" [w=1]; 97 | "SN4" [fillcolor="#ffff00"]; 98 | "SN63" -- "Whitetip" [w=1]; 99 | "SN63" [fillcolor="#ffff00"]; 100 | "SN89" [fillcolor="#ffff00"]; 101 | "SN9" [fillcolor="#ffff00"]; 102 | "SN90" [fillcolor="#ffff00"]; 103 | "SN96" [fillcolor="#ffff00"]; 104 | "Stripes" -- "TR120" [w=1]; 105 | "Stripes" -- "TSN83" [w=1]; 106 | "Stripes" [fillcolor="#ffff00"]; 107 | "Thumper" [fillcolor="#ffff00"]; 108 | "Topless" [fillcolor="#ffff00"]; 109 | "TR120" [fillcolor="#ffff00"]; 110 | "TR77" [fillcolor="#ffff00"]; 111 | "TR82" [fillcolor="#ffff00"]; 112 | "TR88" [fillcolor="#ffff00"]; 113 | "TR99" [fillcolor="#ffff00"]; 114 | "Trigger" [fillcolor="#ffff00"]; 115 | "TSN103" [fillcolor="#ffff00"]; 116 | "TSN83" [fillcolor="#ffff00"]; 117 | "Upbang" [fillcolor="#ffff00"]; 118 | "Vau" [fillcolor="#ffff00"]; 119 | "Wave" [fillcolor="#ffff00"]; 120 | "Web" [fillcolor="#ffff00"]; 121 | "Whitetip" [fillcolor="#ffff00"]; 122 | "Zap" [fillcolor="#ffff00"]; 123 | "Zig" [fillcolor="#ffff00"]; 124 | "Zipfel" [fillcolor="#ffff00"]; 125 | } 126 | -------------------------------------------------------------------------------- /lib/graph-utils/data/dolphins-cut.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/dolphins-cut.png -------------------------------------------------------------------------------- /lib/graph-utils/data/dolphins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/dolphins.png -------------------------------------------------------------------------------- /lib/graph-utils/data/dolphins.txt: -------------------------------------------------------------------------------- 1 | The file dolphins.gml contains an undirected social network of frequent 2 | associations between 62 dolphins in a community living off Doubtful Sound, 3 | New Zealand, as compiled by Lusseau et al. (2003). Please cite 4 | 5 | D. Lusseau, K. Schneider, O. J. Boisseau, P. Haase, E. Slooten, and 6 | S. M. Dawson, The bottlenose dolphin community of Doubtful Sound features 7 | a large proportion of long-lasting associations, Behavioral Ecology and 8 | Sociobiology 54, 396-405 (2003). 9 | 10 | Additional information on the network can be found in 11 | 12 | D. Lusseau, The emergent properties of a dolphin social network, 13 | Proc. R. Soc. London B (suppl.) 270, S186-S188 (2003). 14 | 15 | D. Lusseau, Evidence for social role in a dolphin social network, 16 | Preprint q-bio/0607048 (http://arxiv.org/abs/q-bio.PE/0607048) 17 | -------------------------------------------------------------------------------- /lib/graph-utils/data/flow-big.net: -------------------------------------------------------------------------------- 1 | *Vertices 8 2 | 0 0 3 | 1 1 4 | 2 2 5 | 3 3 6 | 4 4 7 | 5 5 8 | 6 6 9 | 7 7 10 | *Arcs 11 | 0 1 7 12 | 0 3 2 13 | 1 2 4 14 | 2 3 4 15 | 2 5 2 16 | 3 4 4 17 | 4 5 8 18 | 6 7 2 19 | 2 6 10 20 | -------------------------------------------------------------------------------- /lib/graph-utils/data/flow-simple.dot: -------------------------------------------------------------------------------- 1 | digraph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "0" -> "1" [w=1,label=1]; 6 | "0" -> "2" [w=1,label=1]; 7 | "0" -> "3" [w=1,label=1]; 8 | "0" [fillcolor="#ffff00"]; 9 | "1" -> "3" [w=1,label=1]; 10 | "1" [fillcolor="#ffff00"]; 11 | "2" -> "3" [w=1,label=1]; 12 | "2" [fillcolor="#ffff00"]; 13 | "3" [fillcolor="#ffff00"]; 14 | } 15 | -------------------------------------------------------------------------------- /lib/graph-utils/data/flow-simple.net: -------------------------------------------------------------------------------- 1 | *Vertices 4 2 | 0 0 3 | 1 1 4 | 2 2 5 | 3 3 6 | *Arcs 7 | 0 1 1 8 | 0 2 1 9 | 1 3 1 10 | 2 3 1 11 | 0 3 1 12 | -------------------------------------------------------------------------------- /lib/graph-utils/data/flow.dot: -------------------------------------------------------------------------------- 1 | digraph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "0" -> "1" [w=7,label=7]; 6 | "0" -> "3" [w=2,label=2]; 7 | "0" [fillcolor="#ffff00"]; 8 | "1" -> "2" [w=4,label=4]; 9 | "1" [fillcolor="#ffff00"]; 10 | "2" -> "3" [w=4,label=4]; 11 | "2" -> "5" [w=2,label=2]; 12 | "2" -> "6" [w=10,label=10]; 13 | "2" [fillcolor="#ffff00"]; 14 | "3" -> "4" [w=4,label=4]; 15 | "3" [fillcolor="#ffff00"]; 16 | "4" -> "5" [w=8,label=8]; 17 | "4" [fillcolor="#ffff00"]; 18 | "5" [fillcolor="#ffff00"]; 19 | "6" -> "7" [w=2,label=2]; 20 | "6" [fillcolor="#ffff00"]; 21 | "7" [fillcolor="#ffff00"]; 22 | } 23 | -------------------------------------------------------------------------------- /lib/graph-utils/data/flow.net: -------------------------------------------------------------------------------- 1 | *Vertices 8 2 | 0 0 3 | 1 1 4 | 2 2 5 | 3 3 6 | 4 4 7 | 5 5 8 | 6 6 9 | 7 7 10 | *Arcs 11 | 0 1 7 12 | 0 3 2 13 | 1 2 4 14 | 2 3 4 15 | 2 5 2 16 | 3 4 4 17 | 4 5 8 18 | 6 7 2 19 | 2 6 10 20 | -------------------------------------------------------------------------------- /lib/graph-utils/data/flow1.dot: -------------------------------------------------------------------------------- 1 | digraph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "0" -> "1" [w=1,label=1]; 6 | "0" -> "2" [w=2,label=2]; 7 | "0" -> "4" [w=1,label=1]; 8 | "0" -> "8" [w=2,label=2]; 9 | "0" -> "9" [w=3,label=3]; 10 | "0" [fillcolor="#ffff00"]; 11 | "1" -> "2" [w=1,label=1]; 12 | "1" [fillcolor="#ffff00"]; 13 | "2" -> "3" [w=3,label=3]; 14 | "2" [fillcolor="#ffff00"]; 15 | "3" -> "5" [w=4,label=4]; 16 | "3" [fillcolor="#ffff00"]; 17 | "4" -> "6" [w=1,label=1]; 18 | "4" [fillcolor="#ffff00"]; 19 | "5" [fillcolor="#ffff00"]; 20 | "6" -> "7" [w=1,label=1]; 21 | "6" [fillcolor="#ffff00"]; 22 | "7" -> "5" [w=2,label=2]; 23 | "7" [fillcolor="#ffff00"]; 24 | "8" -> "5" [w=1,label=1]; 25 | "8" -> "7" [w=1,label=1]; 26 | "8" [fillcolor="#ffff00"]; 27 | "9" -> "10" [w=3,label=3]; 28 | "9" [fillcolor="#ffff00"]; 29 | "10" -> "11" [w=3,label=3]; 30 | "10" [fillcolor="#ffff00"]; 31 | "11" -> "2" [w=1,label=1]; 32 | "11" -> "3" [w=1,label=1]; 33 | "11" -> "5" [w=1,label=1]; 34 | "11" [fillcolor="#ffff00"]; 35 | } 36 | -------------------------------------------------------------------------------- /lib/graph-utils/data/flow1.net: -------------------------------------------------------------------------------- 1 | *Vertices 12 2 | 0 0 3 | 1 1 4 | 2 2 5 | 3 3 6 | 4 4 7 | 5 5 8 | 6 6 9 | 7 7 10 | 8 8 11 | 9 9 12 | 10 10 13 | 11 11 14 | *Arcs 15 | 0 1 1 16 | 1 2 1 17 | 2 3 3 18 | 3 5 4 19 | 0 4 1 20 | 4 6 1 21 | 6 7 1 22 | 7 5 2 23 | 0 8 2 24 | 8 5 1 25 | 0 2 2 26 | 8 7 1 27 | 0 9 3 28 | 9 10 3 29 | 10 11 3 30 | 11 2 1 31 | 11 5 1 32 | 11 3 1 33 | -------------------------------------------------------------------------------- /lib/graph-utils/data/football-cut.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/football-cut.png -------------------------------------------------------------------------------- /lib/graph-utils/data/football.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/football.png -------------------------------------------------------------------------------- /lib/graph-utils/data/football.txt: -------------------------------------------------------------------------------- 1 | The file football.gml contains the network of American football games 2 | between Division IA colleges during regular season Fall 2000, as compiled 3 | by M. Girvan and M. Newman. The nodes have values that indicate to which 4 | conferences they belong. The values are as follows: 5 | 6 | 0 = Atlantic Coast 7 | 1 = Big East 8 | 2 = Big Ten 9 | 3 = Big Twelve 10 | 4 = Conference USA 11 | 5 = Independents 12 | 6 = Mid-American 13 | 7 = Mountain West 14 | 8 = Pacific Ten 15 | 9 = Southeastern 16 | 10 = Sun Belt 17 | 11 = Western Athletic 18 | 19 | If you make use of these data, please cite M. Girvan and M. E. J. Newman, 20 | Community structure in social and biological networks, 21 | Proc. Natl. Acad. Sci. USA 99, 7821-7826 (2002). 22 | -------------------------------------------------------------------------------- /lib/graph-utils/data/karate-cut.dot: -------------------------------------------------------------------------------- 1 | graph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "26" -- "25" [w=1,label=1]; 6 | "26" [fillcolor="#ffff00"]; 7 | "25" -- "32" [w=1,label=1]; 8 | "25" [fillcolor="#ffff00"]; 9 | "6" -- "1" [w=1,label=1]; 10 | "6" -- "17" [w=1,label=1]; 11 | "6" [fillcolor="#ffff00"]; 12 | "18" -- "1" [w=1,label=1]; 13 | "18" [fillcolor="#ffff00"]; 14 | "7" -- "1" [w=1,label=1]; 15 | "7" [fillcolor="#ffff00"]; 16 | "17" [fillcolor="#ffff00"]; 17 | "28" -- "24" [w=1,label=1]; 18 | "28" [fillcolor="#ffff00"]; 19 | "8" -- "1" [w=1,label=1]; 20 | "8" [fillcolor="#ffff00"]; 21 | "27" -- "30" [w=1,label=1]; 22 | "27" [fillcolor="#ffff00"]; 23 | "11" -- "1" [w=1,label=1]; 24 | "11" [fillcolor="#ffff00"]; 25 | "9" -- "1" [w=1,label=1]; 26 | "9" -- "31" [w=1,label=1]; 27 | "9" [fillcolor="#ffff00"]; 28 | "10" -- "3" [w=1,label=1]; 29 | "10" [fillcolor="#ffff00"]; 30 | "21" -- "33" [w=1,label=1]; 31 | "21" [fillcolor="#ffff00"]; 32 | "1" -- "2" [w=1,label=1]; 33 | "1" -- "3" [w=1,label=1]; 34 | "1" -- "4" [w=1,label=1]; 35 | "1" -- "5" [w=1,label=1]; 36 | "1" -- "12" [w=1,label=1]; 37 | "1" -- "13" [w=1,label=1]; 38 | "1" -- "14" [w=1,label=1]; 39 | "1" -- "20" [w=1,label=1]; 40 | "1" -- "22" [w=1,label=1]; 41 | "1" [fillcolor="#ffff00"]; 42 | "19" -- "33" [w=1,label=1]; 43 | "19" [fillcolor="#ffff00"]; 44 | "32" -- "29" [w=1,label=1]; 45 | "32" [fillcolor="#ffff00"]; 46 | "20" [fillcolor="#ffff00"]; 47 | "14" [fillcolor="#ffff00"]; 48 | "31" [fillcolor="#ffff00"]; 49 | "29" [fillcolor="#ffff00"]; 50 | "2" [fillcolor="#ffff00"]; 51 | "30" -- "24" [w=1,label=1]; 52 | "30" [fillcolor="#ffff00"]; 53 | "13" [fillcolor="#ffff00"]; 54 | "24" -- "33" [w=1,label=1]; 55 | "24" -- "34" [w=1,label=1]; 56 | "24" [fillcolor="#ffff00"]; 57 | "12" [fillcolor="#ffff00"]; 58 | "3" -- "33" [w=1,label=1]; 59 | "3" [fillcolor="#ffff00"]; 60 | "23" -- "33" [w=1,label=1]; 61 | "23" [fillcolor="#ffff00"]; 62 | "34" [fillcolor="#ffff00"]; 63 | "22" [fillcolor="#ffff00"]; 64 | "4" [fillcolor="#ffff00"]; 65 | "33" -- "15" [w=1,label=1]; 66 | "33" -- "16" [w=1,label=1]; 67 | "33" [fillcolor="#ffff00"]; 68 | "16" [fillcolor="#ffff00"]; 69 | "5" [fillcolor="#ffff00"]; 70 | "15" [fillcolor="#ffff00"]; 71 | } 72 | -------------------------------------------------------------------------------- /lib/graph-utils/data/karate-cut.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/karate-cut.png -------------------------------------------------------------------------------- /lib/graph-utils/data/karate.dot: -------------------------------------------------------------------------------- 1 | graph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "26" -- "24" [w=1,label=1]; 6 | "26" -- "25" [w=1,label=1]; 7 | "26" -- "32" [w=1,label=1]; 8 | "26" [fillcolor="#ffff00"]; 9 | "25" -- "28" [w=1,label=1]; 10 | "25" -- "32" [w=1,label=1]; 11 | "25" [fillcolor="#ffff00"]; 12 | "6" -- "1" [w=1,label=1]; 13 | "6" -- "7" [w=1,label=1]; 14 | "6" -- "11" [w=1,label=1]; 15 | "6" -- "17" [w=1,label=1]; 16 | "6" [fillcolor="#ffff00"]; 17 | "18" -- "1" [w=1,label=1]; 18 | "18" -- "2" [w=1,label=1]; 19 | "18" [fillcolor="#ffff00"]; 20 | "7" -- "1" [w=1,label=1]; 21 | "7" -- "5" [w=1,label=1]; 22 | "7" -- "17" [w=1,label=1]; 23 | "7" [fillcolor="#ffff00"]; 24 | "17" [fillcolor="#ffff00"]; 25 | "28" -- "3" [w=1,label=1]; 26 | "28" -- "24" [w=1,label=1]; 27 | "28" -- "34" [w=1,label=1]; 28 | "28" [fillcolor="#ffff00"]; 29 | "8" -- "1" [w=1,label=1]; 30 | "8" -- "2" [w=1,label=1]; 31 | "8" -- "3" [w=1,label=1]; 32 | "8" -- "4" [w=1,label=1]; 33 | "8" [fillcolor="#ffff00"]; 34 | "27" -- "30" [w=1,label=1]; 35 | "27" -- "34" [w=1,label=1]; 36 | "27" [fillcolor="#ffff00"]; 37 | "11" -- "1" [w=1,label=1]; 38 | "11" -- "5" [w=1,label=1]; 39 | "11" [fillcolor="#ffff00"]; 40 | "9" -- "1" [w=1,label=1]; 41 | "9" -- "3" [w=1,label=1]; 42 | "9" -- "31" [w=1,label=1]; 43 | "9" -- "33" [w=1,label=1]; 44 | "9" -- "34" [w=1,label=1]; 45 | "9" [fillcolor="#ffff00"]; 46 | "10" -- "3" [w=1,label=1]; 47 | "10" -- "34" [w=1,label=1]; 48 | "10" [fillcolor="#ffff00"]; 49 | "21" -- "33" [w=1,label=1]; 50 | "21" -- "34" [w=1,label=1]; 51 | "21" [fillcolor="#ffff00"]; 52 | "1" -- "2" [w=1,label=1]; 53 | "1" -- "3" [w=1,label=1]; 54 | "1" -- "4" [w=1,label=1]; 55 | "1" -- "5" [w=1,label=1]; 56 | "1" -- "12" [w=1,label=1]; 57 | "1" -- "13" [w=1,label=1]; 58 | "1" -- "14" [w=1,label=1]; 59 | "1" -- "20" [w=1,label=1]; 60 | "1" -- "22" [w=1,label=1]; 61 | "1" -- "32" [w=1,label=1]; 62 | "1" [fillcolor="#ffff00"]; 63 | "19" -- "33" [w=1,label=1]; 64 | "19" -- "34" [w=1,label=1]; 65 | "19" [fillcolor="#ffff00"]; 66 | "32" -- "29" [w=1,label=1]; 67 | "32" -- "33" [w=1,label=1]; 68 | "32" -- "34" [w=1,label=1]; 69 | "32" [fillcolor="#ffff00"]; 70 | "20" -- "2" [w=1,label=1]; 71 | "20" -- "34" [w=1,label=1]; 72 | "20" [fillcolor="#ffff00"]; 73 | "14" -- "2" [w=1,label=1]; 74 | "14" -- "3" [w=1,label=1]; 75 | "14" -- "4" [w=1,label=1]; 76 | "14" -- "34" [w=1,label=1]; 77 | "14" [fillcolor="#ffff00"]; 78 | "31" -- "2" [w=1,label=1]; 79 | "31" -- "33" [w=1,label=1]; 80 | "31" -- "34" [w=1,label=1]; 81 | "31" [fillcolor="#ffff00"]; 82 | "29" -- "3" [w=1,label=1]; 83 | "29" -- "34" [w=1,label=1]; 84 | "29" [fillcolor="#ffff00"]; 85 | "2" -- "3" [w=1,label=1]; 86 | "2" -- "4" [w=1,label=1]; 87 | "2" -- "22" [w=1,label=1]; 88 | "2" [fillcolor="#ffff00"]; 89 | "30" -- "24" [w=1,label=1]; 90 | "30" -- "33" [w=1,label=1]; 91 | "30" -- "34" [w=1,label=1]; 92 | "30" [fillcolor="#ffff00"]; 93 | "13" -- "4" [w=1,label=1]; 94 | "13" [fillcolor="#ffff00"]; 95 | "24" -- "33" [w=1,label=1]; 96 | "24" -- "34" [w=1,label=1]; 97 | "24" [fillcolor="#ffff00"]; 98 | "12" [fillcolor="#ffff00"]; 99 | "3" -- "4" [w=1,label=1]; 100 | "3" -- "33" [w=1,label=1]; 101 | "3" [fillcolor="#ffff00"]; 102 | "23" -- "33" [w=1,label=1]; 103 | "23" -- "34" [w=1,label=1]; 104 | "23" [fillcolor="#ffff00"]; 105 | "34" -- "15" [w=1,label=1]; 106 | "34" -- "16" [w=1,label=1]; 107 | "34" -- "33" [w=1,label=1]; 108 | "34" [fillcolor="#ffff00"]; 109 | "22" [fillcolor="#ffff00"]; 110 | "4" [fillcolor="#ffff00"]; 111 | "33" -- "15" [w=1,label=1]; 112 | "33" -- "16" [w=1,label=1]; 113 | "33" [fillcolor="#ffff00"]; 114 | "16" [fillcolor="#ffff00"]; 115 | "5" [fillcolor="#ffff00"]; 116 | "15" [fillcolor="#ffff00"]; 117 | } 118 | -------------------------------------------------------------------------------- /lib/graph-utils/data/karate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/karate.png -------------------------------------------------------------------------------- /lib/graph-utils/data/karate.txt: -------------------------------------------------------------------------------- 1 | The file karate.gml contains the network of friendships between the 34 2 | members of a karate club at a US university, as described by Wayne Zachary 3 | in 1977. If you use these data in your work, please cite W. W. Zachary, An 4 | information flow model for conflict and fission in small groups, Journal of 5 | Anthropological Research 33, 452-473 (1977). 6 | -------------------------------------------------------------------------------- /lib/graph-utils/data/matching.dot: -------------------------------------------------------------------------------- 1 | digraph graphutils 2 | { 3 | splines=true; 4 | node [ color = black, fillcolor = white, style = filled ]; 5 | "0" -> "1" [w=1,label=1]; 6 | "0" [fillcolor="#ffff00"]; 7 | "1" -> "SINK" [w=1,label=1]; 8 | "1" [fillcolor="#ffff00"]; 9 | "2" -> "3" [w=1,label=1]; 10 | "2" -> "5" [w=1,label=1]; 11 | "2" [fillcolor="#ffff00"]; 12 | "3" -> "SINK" [w=1,label=1]; 13 | "3" [fillcolor="#ffff00"]; 14 | "4" -> "1" [w=1,label=1]; 15 | "4" -> "5" [w=1,label=1]; 16 | "4" [fillcolor="#ffff00"]; 17 | "5" -> "SINK" [w=1,label=1]; 18 | "5" [fillcolor="#ffff00"]; 19 | "6" -> "5" [w=1,label=1]; 20 | "6" -> "7" [w=1,label=1]; 21 | "6" [fillcolor="#ffff00"]; 22 | "7" -> "SINK" [w=1,label=1]; 23 | "7" [fillcolor="#ffff00"]; 24 | "8" -> "9" [w=1,label=1]; 25 | "8" [fillcolor="#ffff00"]; 26 | "9" -> "SINK" [w=1,label=1]; 27 | "9" [fillcolor="#ffff00"]; 28 | "10" -> "1" [w=1,label=1]; 29 | "10" -> "9" [w=1,label=1]; 30 | "10" -> "11" [w=1,label=1]; 31 | "10" [fillcolor="#ffff00"]; 32 | "11" -> "SINK" [w=1,label=1]; 33 | "11" [fillcolor="#ffff00"]; 34 | "SOURCE" -> "0" [w=1,label=1]; 35 | "SOURCE" -> "2" [w=1,label=1]; 36 | "SOURCE" -> "4" [w=1,label=1]; 37 | "SOURCE" -> "6" [w=1,label=1]; 38 | "SOURCE" -> "8" [w=1,label=1]; 39 | "SOURCE" -> "10" [w=1,label=1]; 40 | "SOURCE" [fillcolor="#ffff00"]; 41 | "SINK" [fillcolor="#ffff00"]; 42 | } 43 | -------------------------------------------------------------------------------- /lib/graph-utils/data/netscience.txt: -------------------------------------------------------------------------------- 1 | The file netscience.gml contains a coauthorship network of scientists 2 | working on network theory and experiment, as compiled by M. Newman in May 3 | 2006. The network was compiled from the bibliographies of two review 4 | articles on networks, M. E. J. Newman, SIAM Review 45, 167-256 (2003) and 5 | S. Boccaletti et al., Physics Reports 424, 175-308 (2006), with a few 6 | additional references added by hand. The version given here contains all 7 | components of the network, for a total of 1589 scientists, and not just the 8 | largest component of 379 scientists previously published. The network is 9 | weighted, with weights assigned as described in M. E. J. Newman, 10 | Phys. Rev. E 64, 016132 (2001). 11 | 12 | If you make use of these data, please cite M. E. J. Newman, Finding 13 | community structure in networks using the eigenvectors of matrices, 14 | Preprint physics/0605087 (2006). 15 | -------------------------------------------------------------------------------- /lib/graph-utils/data/netscience.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/lib/graph-utils/data/netscience.zip -------------------------------------------------------------------------------- /lib/graph-utils/data/test.net: -------------------------------------------------------------------------------- 1 | *Vertices 6 2 | 0 0 3 | 1 1 4 | 2 2 5 | 3 3 6 | 4 4 7 | 5 5 8 | *Arcs 9 | 0 1 1 10 | 0 2 1 11 | 3 1 1 12 | 3 2 1 13 | 4 5 2 14 | -------------------------------------------------------------------------------- /lib/graph-utils/edge.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:graph-utils) 2 | 3 | (defun print-edge (edge stream depth) 4 | (declare (ignore depth)) 5 | (format stream "#" (edge-node1 edge) (edge-node2 edge))) 9 | 10 | (defstruct (edge 11 | (:print-function print-edge)) 12 | node1 13 | node2 14 | weight) 15 | 16 | ;; Node / edge index 17 | (defun sxhash-node (n) 18 | (sxhash (node-id n))) 19 | 20 | (sb-ext:define-hash-table-test node= sxhash-node) 21 | 22 | (defun make-node-table () 23 | (make-hash-table :test 'node=)) 24 | 25 | (defstruct (edge-index (:conc-name nil) 26 | (:constructor %make-edge-index)) 27 | table) 28 | 29 | (defun make-edge-index () 30 | (%make-edge-index :table (make-node-table))) 31 | 32 | (defmethod add-edge-to-index ((index edge-index) edge) 33 | (let ((table (or (gethash (edge-node1 edge) (table index)) 34 | (setf (gethash (edge-node1 edge) (table index)) 35 | (make-node-table))))) 36 | (setf (gethash (edge-node2 edge) table) edge))) 37 | 38 | (defun remove-edge-from-index (index edge) 39 | (let ((table (or (gethash (edge-node1 edge) (table index)) 40 | (setf (gethash (edge-node1 edge) (table index)) 41 | (make-node-table ))))) 42 | (remhash (edge-node2 edge) table))) 43 | 44 | (defun lookup-edge (index node1 node2) 45 | (let ((table (gethash node1 (table index)))) 46 | (when (hash-table-p table) 47 | (gethash node2 table)))) 48 | -------------------------------------------------------------------------------- /lib/graph-utils/edit-distance.lisp: -------------------------------------------------------------------------------- 1 | (in-package :graph-utils) 2 | -------------------------------------------------------------------------------- /lib/graph-utils/functor.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:graph-utils) 2 | 3 | (defstruct (functor 4 | (:constructor %make-functor) 5 | (:predicate functor?)) 6 | name fn clauses (lock (make-recursive-lock))) 7 | 8 | (defgeneric prolog-compile (functor)) 9 | 10 | (defun lookup-functor (name) 11 | (gethash name *user-functors*)) 12 | 13 | (defun make-functor (&key name clauses) 14 | (or (lookup-functor name) 15 | (let ((functor (%make-functor :name name :clauses clauses))) 16 | (with-recursive-lock-held ((functor-lock functor)) 17 | (prog1 18 | (setf (gethash name *user-functors*) functor) 19 | (prolog-compile functor)))))) 20 | 21 | (defun add-functor-clause (functor clause) 22 | (with-recursive-lock-held ((functor-lock functor)) 23 | #+sbcl 24 | (sb-ext:cas (cdr (last (functor-clauses functor))) 25 | (cdr (last (functor-clauses functor))) 26 | (list clause)) 27 | #-sbcl 28 | (setf (cdr (last (functor-clauses functor))) 29 | (list clause)) 30 | (prolog-compile functor)) 31 | (functor-clauses functor)) 32 | 33 | (defun delete-functor (functor) 34 | (remhash (functor-name functor) *user-functors*)) 35 | 36 | (defun reset-functor (functor) 37 | (with-recursive-lock-held ((functor-lock functor)) 38 | #+sbcl 39 | (sb-ext:cas (functor-clauses functor) (functor-clauses functor) nil) 40 | #-sbcl 41 | (setf (functor-clauses functor) nil) 42 | (prolog-compile functor)) 43 | nil) 44 | 45 | (defun get-functor-fn (functor-symbol) 46 | (let ((f (lookup-functor functor-symbol))) 47 | (when (functor? f) 48 | (functor-fn f)))) 49 | 50 | (defun set-functor-fn (functor-symbol fn) 51 | (let ((f (lookup-functor functor-symbol))) 52 | (when *prolog-trace* 53 | (format t "TRACE: set-functor-fn for ~A got ~A~%" functor-symbol f)) 54 | (if (functor? f) 55 | (setf (functor-fn f) fn) 56 | (error 'prolog-error 57 | :reason (format nil "unknown functor ~A" functor-symbol))))) 58 | -------------------------------------------------------------------------------- /lib/graph-utils/graph-generation.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:graph-utils) 2 | 3 | (defmethod check-degree ((graph graph) degree) 4 | (map-nodes #'(lambda (name id) 5 | (unless (= (degree graph id) degree) 6 | (format t "~A: ~A has degree ~A~%" 7 | id name (degree graph id)) 8 | id)) 9 | graph :collect? t :remove-nulls? t)) 10 | 11 | (defmethod generate-random-graph ((model (eql :viger-latapy)) (size integer) 12 | &key degree (name-fn #'princ-to-string) 13 | (swaps 20) (node-comparator 'equal) 14 | &allow-other-keys) 15 | "Generate a random, connected graph of SIZE nodes with average degree as 16 | close to DEGREE as possible. Method based on 17 | http://www-rp.lip6.fr/~latapy/Publis/random.pdf" 18 | (assert (and (integerp degree) (plusp degree))) 19 | (let* ((graph (make-graph :node-comparator node-comparator)) (queue nil)) 20 | (dotimes (i size) 21 | (push i queue) 22 | (add-node graph (funcall name-fn i))) 23 | (labels ((choose-node () 24 | (setq queue (sort queue #'< :key #'(lambda (id) 25 | (degree graph id)))) 26 | (let ((node (first queue))) 27 | (when node 28 | (if (>= (degree graph node) degree) 29 | (progn 30 | (pop queue) 31 | (choose-node)) 32 | node))))) 33 | (map-nodes (lambda (name id) 34 | (declare (ignore name)) 35 | (let ((difference (- degree (degree graph id)))) 36 | (if (> difference 0) 37 | (dotimes (i difference) 38 | (let ((end-point (choose-node))) 39 | (when end-point 40 | (add-edge graph id end-point)) 41 | (setq queue (remove id queue))))))) 42 | graph) 43 | (unless (and (>= (edge-count graph) (1- (node-count graph))) 44 | (null (zero-degree-nodes graph))) 45 | (error "Graph contains zero degree nodes!")) 46 | (let ((components (find-components graph))) 47 | (unless (= 1 (length components)) 48 | (error "Graph has more than one component!"))) 49 | (dotimes (swap swaps) 50 | (let ((edge1 (random-edge graph)) (edge2 (random-edge graph))) 51 | (swap-edges graph edge1 edge2) 52 | ;;(unless (= 1 (length (find-components graph))) 53 | ;; (error "Edge swap of ~A and ~A disconnected the graph" 54 | ;; edge1 edge2)) 55 | )) 56 | (unless (= 1 (length (find-components graph))) 57 | (error "Edge swaps disconnected the graph!")) 58 | graph))) 59 | 60 | (defmethod generate-random-graph ((model (eql :erdos-renyi)) (size integer) 61 | &key p (name-fn #'princ-to-string)) 62 | (let ((graph (make-graph))) 63 | (dotimes (i size) 64 | (add-node graph (funcall name-fn i))) 65 | (dotimes (i size) 66 | (loop for j from (1+ i) to (1- size) do 67 | (when (<= (random 1.0) p) 68 | (add-edge graph i j)))) 69 | graph)) 70 | 71 | (defmethod generate-random-graph ((model (eql :barabasi-albert)) 72 | (size integer) 73 | &key (saturation-point 0) 74 | (name-fn #'princ-to-string) 75 | &allow-other-keys) 76 | (when (< size 4) 77 | (error "Cannot generate a barabasi-albert graph of size less than 4")) 78 | (let ((graph (make-graph :saturation-point saturation-point)) 79 | (degree-table (make-array size 80 | :element-type 'integer 81 | :initial-element 0))) 82 | (dotimes (i 3) 83 | (add-node graph (funcall name-fn i))) 84 | (dotimes (i 3) 85 | (loop for j from (1+ i) to 2 do 86 | (incf (aref degree-table i)) 87 | (incf (aref degree-table j)) 88 | (add-edge graph i j))) 89 | (loop for i from 3 to (1- size) do 90 | (add-node graph (funcall name-fn i))) 91 | (loop for i from 3 to (1- size) do 92 | (loop for j from 0 to (1- i) do 93 | (when (/= i j) 94 | (when (not (and (> (s-point graph) 0) 95 | (>= (aref degree-table j) (s-point graph)))) 96 | (when (<= (random 1.0) 97 | ;; This is the traditional barabasi-albert 98 | ;; calculation: 99 | ;; (/ (aref degree-table j) (edge-count graph))) 100 | ;; This is what we used for Lab 2: 101 | (/ (1+ (aref degree-table j)) 102 | (+ (edge-count graph) (node-count graph)))) 103 | (incf (aref degree-table i)) 104 | (incf (aref degree-table j)) 105 | (add-edge graph i j)))))) 106 | graph)) 107 | -------------------------------------------------------------------------------- /lib/graph-utils/graph-package.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:cl-user) 2 | 3 | (defpackage #:graph-utils 4 | (:use #:cl #:cl-ppcre #:dso-lex #:yacc #:bordeaux-threads 5 | #+sbcl #:cl-skip-list) 6 | (:export #:make-graph 7 | #:make-typed-graph 8 | #:typed-graph? 9 | #:add-edge-type 10 | #:graph 11 | #:copy-graph 12 | #:graph= 13 | #:graph? 14 | #:directed? 15 | #:undirected? 16 | #:node-ids 17 | #:add-node 18 | #:delete-node 19 | #:adjust-adjacency-matrix 20 | #:lookup-node 21 | #:map-nodes 22 | #:list-nodes 23 | #:random-node-id 24 | #:random-node 25 | #:rename-node 26 | #:node-count 27 | #:leaves 28 | #:leaf? 29 | #:neighbors 30 | #:inbound-neighbors 31 | #:outbound-neighbors 32 | #:do-outbound-neighbors 33 | #:edges 34 | #:edge-exists? 35 | #:add-edge 36 | #:delete-edge 37 | #:map-edges 38 | #:list-edges 39 | #:edge-count 40 | #:edge-weight 41 | #:density 42 | #:degree 43 | #:in-degree 44 | #:out-degree 45 | #:degree-distribution 46 | #:in-degree-distribution 47 | #:find-shortest-path 48 | #:all-pairs-shortest-paths 49 | #:reconstruct-path-all-pairs 50 | #:calculate-shortest-paths 51 | #:spanning-tree 52 | #:distance-map 53 | #:find-components 54 | #:score-edges 55 | #:cluster 56 | #:minimal-cut 57 | #:minimal-cut! 58 | #:compute-maximum-flow 59 | #:compute-maximum-matching 60 | #:bipartite? 61 | #:visualize 62 | #:generate-random-graph 63 | #:compute-page-rank-distribution 64 | #:compute-page-rank 65 | #:compute-hub-authority-values 66 | #:compute-center-nodes 67 | #:parse-pajek 68 | #:parse-gml 69 | 70 | ;; Prolog 71 | #:def-global-prolog-functor 72 | #:def-prolog-compiler-macro 73 | #:compile-body 74 | #:args-of 75 | #:*prolog-global-functors* 76 | #:deref-exp 77 | #:unify 78 | #:select 79 | #:?- 80 | #:q- 81 | #:get-triples 82 | #:add-triple 83 | #:delete-triple 84 | #:make-triple 85 | #:lookup-triple 86 | #:subject 87 | #:predicate 88 | #:object 89 | #:weight 90 | #:var-deref 91 | #:replace-?-vars 92 | #:variables-in 93 | #:make-functor-symbol 94 | #:*trail* 95 | #:*var-counter* 96 | #:*functor* 97 | #:make-functor 98 | #:maybe-add-undo-bindings 99 | #:compile-clause 100 | #:show-prolog-vars 101 | #:prolog-error 102 | #:prolog-ignore 103 | #:delete-functor 104 | #:set-functor-fn 105 | #:*select-list* 106 | #:select-flat 107 | #:select-first 108 | #:do-query 109 | #:map-query 110 | #:valid-prolog-query? 111 | #:init-prolog 112 | #:*prolog-graph* 113 | #:*prolog-trace* 114 | #:trace-prolog 115 | #:untrace-prolog 116 | )) 117 | -------------------------------------------------------------------------------- /lib/graph-utils/graph-test.lisp: -------------------------------------------------------------------------------- 1 | (require 'asdf) 2 | (asdf:oos 'asdf:load-op 'graph-utils) 3 | (use-package :graph-utils) 4 | 5 | (defun graph-test-old () 6 | (dolist (file '("data/Dining.net" "data/karate.gml")) 7 | ;;"data/dolphins.gml" "data/adjnoun.gml" "data/football.gml")) 8 | (format t "Doing graph ~A~%" file) 9 | (let ((graph (if (cl-ppcre:scan "gml$" file) (parse-gml file) (parse-pajek file))) 10 | (stem (cl-ppcre:regex-replace-all "(\.(net|gml))$" file ""))) 11 | (with-open-file (out (format nil "~A.log" stem) 12 | :direction :output 13 | :if-exists :supersede 14 | :if-does-not-exist :create) 15 | (format out "Graph of ~A: ~A~%" stem graph) 16 | (format out " Density: ~F~%" (density graph)) 17 | (format out " Degree distribution:~%~{ ~A~%~}" (degree-distribution graph)) 18 | (when (directed? graph) 19 | (format out " In-degree distribution:~%~{ ~A~%~}" (in-degree-distribution graph))) 20 | (format out " Components:~%~{ C: ~A~^~%~}~%" (find-components graph)) 21 | (format out " Shortest paths:~%") 22 | (dolist (path (calculate-shortest-paths graph)) 23 | (format out " Path from ~A to ~A: " 24 | (lookup-node graph (first path)) (lookup-node graph (second path))) 25 | (dotimes (i (length (third path))) 26 | (format out "~A -> " (lookup-node graph (first (nth i (third path)))))) 27 | (format out "~A~%" (lookup-node graph (second path)))) 28 | (when (directed? graph) 29 | (let ((pr (compute-page-rank graph :k 2 :scaling-factor 0.9)) 30 | (page-rank nil)) 31 | (dotimes (i (node-count graph)) 32 | (push (list (lookup-node graph i) (aref pr i)) page-rank)) 33 | (format out " Page rank:~%") 34 | (dolist (r (sort page-rank #'> :key #'second)) 35 | (format out " ~10A ~2,10F~%" (first r) (second r)))) 36 | (format out " Hubs-authorities:~%") 37 | (multiple-value-bind (hubs auths) (compute-hub-authority-values graph :normalize? t) 38 | (format out " Hubs:~%") 39 | (dolist (h hubs) 40 | (format out " ~10A ~2,10F~%" (car h) (cdr h))) 41 | (format out " Authorities:~%") 42 | (dolist (a auths) 43 | (format out " ~10A ~2,10F~%" (car a) (cdr a))))) 44 | (visualize graph :file (format nil "~A.dot" stem) :render? t) 45 | (format out " Minimal cut (removed edges):~%~{ ~A~%~}~%" 46 | (mapcar #'(lambda (e) 47 | (format nil "~A -> ~A" (first e) (second e))) 48 | (minimal-cut! graph))) 49 | (visualize graph :file (format nil "~A-cut.dot" stem) :render? t))))) 50 | 51 | 52 | (graph-test-csc525) 53 | -------------------------------------------------------------------------------- /lib/graph-utils/graph-utils.asd: -------------------------------------------------------------------------------- 1 | ;; ASDF package description for graph -*- Lisp -*- 2 | 3 | (defpackage :graph-utils-system (:use :cl :asdf)) 4 | (in-package :graph-utils-system) 5 | 6 | (defsystem graph-utils 7 | :name "graph-utils" 8 | :maintainer "Kevin Raison" 9 | :author "Kevin Raison " 10 | :version "0.1" 11 | :description "Graph utilities." 12 | :depends-on (:cl-ppcre 13 | :dso-lex 14 | :yacc 15 | :trivial-shell 16 | :parse-number 17 | :alexandria 18 | #+sbcl :cl-skip-list 19 | :bordeaux-threads) 20 | :components ((:file "graph-package") 21 | (:file "fib-heap") 22 | (:file "queue" :depends-on ("graph-package")) 23 | (:file "utilities" :depends-on ("queue")) 24 | (:file "node" :depends-on ("utilities")) 25 | ;;(:file "edge" :depends-on ("node")) 26 | (:file "sparse-arrays" :depends-on ("utilities")) 27 | (:file "graph-class" :depends-on ("sparse-arrays" "node")) 28 | (:file "typed-edge-graph-class" :depends-on ("graph-class")) 29 | (:file "graph-methods" :depends-on ("queue" "graph-class" "fib-heap")) 30 | (:file "typed-edge-graph-methods" 31 | :depends-on ("typed-edge-graph-class" "graph-methods")) 32 | #+sbcl 33 | (:file "index" :depends-on ("typed-edge-graph-methods")) 34 | (:file "triples" :depends-on 35 | #+sbcl ("index") 36 | #-sbcl ("typed-edge-graph-methods")) 37 | (:file "functor" :depends-on ("triples")) 38 | (:file "prologc" :depends-on ("functor")) 39 | (:file "prolog-functors" :depends-on ("prologc")) 40 | (:file "maximum-flow" :depends-on ("graph-methods")) 41 | (:file "bipartite" :depends-on ("maximum-flow")) 42 | (:file "cut" :depends-on ("maximum-flow")) 43 | (:file "edit-distance" :depends-on ("graph-methods")) 44 | (:file "graph-generation" :depends-on ("graph-methods")) 45 | (:file "graph-visualization" :depends-on ("graph-methods")) 46 | (:file "parsers" :depends-on ("graph-methods")) 47 | (:file "tests" :depends-on 48 | ("parsers" "graph-visualization" "graph-generation" 49 | "maximum-flow" "bipartite" "cut" 50 | "edit-distance")))) 51 | -------------------------------------------------------------------------------- /lib/graph-utils/index.lisp: -------------------------------------------------------------------------------- 1 | (in-package :graph-utils) 2 | 3 | (defclass index () 4 | ((edge-type :initarg :edge-type :accessor edge-type) 5 | (key-equality-fn :initarg :key-equality-fn :accessor key-equality-fn) 6 | (value-equality-fn :initarg :value-equality-fn :accessor value-equality-fn) 7 | (index :initarg :index :accessor index))) 8 | 9 | (defclass unique-index (index) ()) 10 | 11 | (defclass ordered-index (index) 12 | ((ordering-fn :initarg :ordering-fn :accessor ordering-fn))) 13 | 14 | (defclass unique-ordered-index (ordered-index unique-index) 15 | ()) 16 | 17 | (defun make-index (&key (type 'index) (key-equality-fn 'equal) 18 | (value-equality-fn 'equal) ordering-fn edge-type) 19 | (unless edge-type 20 | (error "You must supply an edge-type to make-index")) 21 | (let ((idx (make-instance type 22 | :key-equality-fn key-equality-fn 23 | :value-equality-fn value-equality-fn 24 | :edge-type edge-type))) 25 | (case type 26 | (unique-ordered-index 27 | (setf (ordering-fn idx) ordering-fn 28 | (index idx) (cl-skip-list:make-skip-list 29 | :key-equal key-equality-fn 30 | :value-equal value-equality-fn 31 | :duplicates-allowed? nil 32 | :comparison ordering-fn))) 33 | (ordered-index 34 | (setf (ordering-fn idx) ordering-fn 35 | (index idx) (cl-skip-list:make-skip-list 36 | :key-equal key-equality-fn 37 | :value-equal value-equality-fn 38 | :duplicates-allowed? t 39 | :comparison ordering-fn))) 40 | ((index unique-index) 41 | (setf (index idx) (make-hash-table :test key-equality-fn :synchronized t))) 42 | (otherwise 43 | (error "Unknown index type ~A" type))) 44 | idx)) 45 | 46 | (defgeneric idx-insert (idx key value &key replace?)) 47 | (defgeneric idx-remove (idx key &optional value)) 48 | (defgeneric idx-get (idx key &key limit)) 49 | 50 | (defmethod idx-insert ((idx index) key value &key replace?) 51 | (declare (ignore replace?)) 52 | (pushnew value (gethash key (index idx)) :test (value-equality-fn idx))) 53 | 54 | (defmethod idx-remove ((idx index) key &optional value) 55 | (setf (gethash key (index idx)) 56 | (if value 57 | (delete value (gethash key (index idx)) :test (value-equality-fn idx)) 58 | nil))) 59 | 60 | (defmethod idx-get ((idx index) key &key limit) 61 | (if (integerp limit) 62 | (let ((r (gethash key (index idx)))) 63 | (if (> (length r) limit) 64 | (subseq r 0 limit) 65 | r)) 66 | (gethash key (index idx)))) 67 | 68 | (defmethod idx-insert ((idx unique-index) key value &key replace?) 69 | (if (and (gethash key (index idx)) (null replace?)) 70 | (error "~A is already populated" key) 71 | (setf (gethash key (index idx)) value))) 72 | 73 | (defmethod idx-remove ((idx unique-index) key &optional value) 74 | (setf (gethash key (index idx)) 75 | (if value 76 | (delete value (gethash key (index idx)) :test (value-equality-fn idx)) 77 | nil))) 78 | 79 | (defmethod idx-get ((idx unique-index) key &key limit) 80 | (if (integerp limit) 81 | (let ((r (gethash key (index idx)))) 82 | (if (> (length r) limit) 83 | (subseq r 0 limit) 84 | r)) 85 | (gethash key (index idx)))) 86 | 87 | (defmethod idx-insert ((idx ordered-index) key value &key replace?) 88 | (if (and replace? (skip-list-lookup (index idx) key)) 89 | (skip-list-replace-kv (index idx) key value) 90 | (skip-list-add (index idx) key value))) 91 | 92 | (defmethod idx-remove ((idx ordered-index) key &optional value) 93 | (skip-list-delete (index idx) key value)) 94 | 95 | (defmethod idx-get ((idx ordered-index) key &key limit) 96 | (declare (ignore limit)) 97 | (skip-list-fetch-all (index idx) key)) 98 | 99 | (defmethod idx-get ((idx unique-ordered-index) key &key limit) 100 | (declare (ignore limit)) 101 | (skip-list-lookup (index idx) key)) 102 | -------------------------------------------------------------------------------- /lib/graph-utils/node.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:graph-utils) 2 | 3 | (defun print-node (node stream depth) 4 | (declare (ignore depth)) 5 | (format stream "#" (node-value node))) 9 | 10 | (defstruct (node 11 | (:constructor %make-node) 12 | (:print-function print-node)) 13 | value 14 | id 15 | weight) 16 | 17 | (let ((id 0)) 18 | (defun next-node-id () 19 | (incf id))) 20 | 21 | (defun make-node (&key value id weight) 22 | (let ((id (or id (next-node-id)))) 23 | (%make-node :value value :weight weight :id id))) 24 | 25 | (defmethod node= (n1 n2) 26 | (= (node-id n1) (node-id n2))) 27 | 28 | (defmethod node-eql (n1 n2) 29 | (and (= (node-id n1) (node-id n2)) 30 | (eql (node-value n1) (node-value n2)))) 31 | 32 | (defmethod node-equal (n1 n2) 33 | (and (= (node-id n1) (node-id n2)) 34 | (equal (node-value n1) (node-value n2)))) 35 | 36 | (defmethod node-equalp (n1 n2) 37 | (and (= (node-id n1) (node-id n2)) 38 | (equalp (node-value n1) (node-value n2)))) 39 | 40 | -------------------------------------------------------------------------------- /lib/graph-utils/queue.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:graph-utils) 2 | 3 | ;; The following queueing code was borrowed and adapted from Russell & Norvig's 4 | ;; "Introduction to AI" 5 | (defun print-queue (q stream depth) 6 | (declare (ignore depth)) 7 | (format stream "~a" (queue-elements q))) 8 | 9 | (defstruct (queue 10 | (:print-function print-queue)) 11 | (key #'identity) 12 | (last nil) 13 | (elements nil)) 14 | 15 | (defun make-empty-queue () (make-queue)) 16 | 17 | (defun empty-queue? (q) 18 | (= (length (queue-elements q)) 0)) 19 | 20 | (defun queue-front (q) 21 | (elt (queue-elements q) 0)) 22 | 23 | (defun dequeue (q) 24 | (when (listp (queue-elements q)) 25 | (pop (queue-elements q)))) 26 | 27 | (defun enqueue (q &rest items) 28 | (cond ((null items) nil) 29 | ((or (null (queue-last q)) (null (queue-elements q))) 30 | (setf (queue-last q) (last items) 31 | (queue-elements q) (nconc (queue-elements q) items))) 32 | (t (setf (cdr (queue-last q)) items 33 | (queue-last q) (last items))))) 34 | 35 | (defun queue-length (q) 36 | (length (queue-elements q))) 37 | ;; End of adapted code 38 | -------------------------------------------------------------------------------- /lib/graph-utils/r-test.lisp: -------------------------------------------------------------------------------- 1 | (ql:quickload :graph-utils) 2 | (ql:quickload :uuid) 3 | (ql:quickload :cl-store) 4 | 5 | (use-package :graph-utils) 6 | 7 | (defun make-cprefs (products) 8 | (let ((plen (length products))) 9 | (dolist (cid (select-flat (?x) (q- ?x :is-a :customer))) 10 | (dotimes (i 20) 11 | (add-triple cid :wants (nth (random plen) products))) 12 | (dotimes (i 100) 13 | (add-triple cid :ranks (nth (random plen) products) (1- (random 2.0))))))) 14 | 15 | (defun make-r-graph (&key (user-count 1000) (product-count 10000)) 16 | (let ((uuids (loop for x from 0 below (+ user-count product-count) 17 | collecting (format nil "~A" (uuid:make-v1-uuid)))) 18 | (graph (make-typed-graph :initial-edge-types 19 | '(:is-a :wants :ranks :sim))) 20 | (products nil) (customers nil)) 21 | (init-prolog graph) 22 | (dotimes (i product-count) 23 | (let ((id (pop uuids))) 24 | (push id products) 25 | (add-triple id :is-a :product))) 26 | (dotimes (i (length products)) 27 | (loop for j from (1+ i) below (length products) do 28 | (unless (= i j) 29 | (let ((p1 (nth i products)) 30 | (p2 (nth j products))) 31 | (let ((sim (1- (random 2.0)))) 32 | (add-triple p1 :sim p2 sim) 33 | (add-triple p2 :sim p1 sim)))))) 34 | (dotimes (i user-count) 35 | (let ((id (pop uuids))) 36 | (push id customers) 37 | (add-triple id :is-a :customer))) 38 | (make-cprefs products) 39 | (cl-store:store graph "rgraph.dat") 40 | graph)) 41 | 42 | (defparameter *graph* (make-r-graph)) 43 | 44 | -------------------------------------------------------------------------------- /lib/graph-utils/tests.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:graph-utils) 2 | 3 | (defun simple-test () 4 | ;;(let ((graph (make-graph :directed? t))) 5 | (let ((graph (make-graph :directed? nil))) 6 | (add-node graph "node1") 7 | (add-node graph "node2") 8 | (add-node graph "node3") 9 | (add-node graph "node4") 10 | (add-edge graph "node1" "node2") 11 | graph)) 12 | 13 | (defun typed-test () 14 | (time 15 | (let ((graph (make-typed-graph))) 16 | (dotimes (i 1000) 17 | (add-edge-type graph (intern (format nil "~@R" (1+ i)) :keyword))) 18 | (dotimes (x 2500000) 19 | (add-node graph (format nil "~B" x))) 20 | (dotimes (x 10000) 21 | (dotimes (y 10000) 22 | (unless (= x y) 23 | (let ((r (random 3))) 24 | (when (or (= 1 r) (= 0 r)) 25 | (add-edge graph x y 26 | :edge-type 27 | (intern (format nil "~@R" (1+ (random 1000))) 28 | :keyword))))))) 29 | graph))) 30 | #| 31 | (let ((graph (make-typed-graph :initial-edge-types '(:is-a :has-name)))) 32 | (add-node graph "Person") 33 | (add-node graph "Human") 34 | (add-node graph "Kevin") 35 | (add-node graph "No one") 36 | (add-edge graph "Person" "Human" :edge-type :is-a) 37 | (add-edge graph "Person" "Kevin" :edge-type :has-name) 38 | (add-edge graph "No one" "Person" :edge-type :owns) 39 | (values graph 40 | (list-edges graph) 41 | (cons "Person" (neighbors graph "Person")) 42 | (cons "Person" (inbound-neighbors graph "Person")) 43 | (cons "Person" (outbound-neighbors graph "Person"))))) 44 | |# 45 | 46 | (defun run-tests () 47 | (let ((flow1 (parse-pajek "data/flow1.net")) 48 | (flow2 (parse-pajek "data/flow.net")) 49 | (flow-simple (parse-pajek "data/flow-simple.net")) 50 | (bgraph (parse-pajek "data/bipartite1.net")) 51 | (results nil)) 52 | (dolist (alg '(:edmond-karp :dinic :karzanov :goldberg-tarjan)) 53 | (let ((f (compute-maximum-flow flow-simple 0 3 :algorithm alg))) 54 | (dbg "graph flow-simple: ~A says max flow is ~D" alg f) 55 | (push (list alg :flow-simple f) results)) 56 | (let ((f (compute-maximum-flow flow1 0 5 :algorithm alg))) 57 | (dbg "graph flow1: ~A says max flow is ~D" alg f) 58 | (push (list alg :flow1 f) results)) 59 | (let ((f (compute-maximum-flow flow2 0 5 :algorithm alg))) 60 | (dbg "graph flow2: ~A says max flow is ~D" alg f) 61 | (push (list alg :flow2 f) results))) 62 | (dolist (alg '(:edmond-karp :dinic :karzanov :goldberg-tarjan)) 63 | (multiple-value-bind (p1 p2) (bipartite? bgraph :show-partitions? t) 64 | (let ((matching 65 | (sort 66 | (if (every #'evenp p1) 67 | (compute-maximum-matching bgraph p1 p2 :algorithm alg) 68 | (compute-maximum-matching bgraph p2 p1 :algorithm alg)) 69 | #'< :key 'first))) 70 | (dbg "graph bipartite1: ~A says matching is~% ~A" alg matching) 71 | (push (list alg :matching matching) results)))) 72 | results)) 73 | 74 | -------------------------------------------------------------------------------- /lib/graph-utils/triples.lisp: -------------------------------------------------------------------------------- 1 | (in-package :graph-utils) 2 | 3 | (defvar *prolog-graph* nil) 4 | 5 | (defun init-prolog (&optional graph) 6 | (cond ((typed-graph? graph) 7 | (setq *prolog-graph* graph)) 8 | ((null graph) 9 | (setq *prolog-graph* (make-typed-graph))) 10 | (t 11 | (error "Please supply a typed graph")))) 12 | 13 | (defstruct (triple 14 | (:constructor %make-triple) 15 | (:predicate triple?) 16 | (:conc-name nil)) 17 | subject predicate object (weight 1)) 18 | 19 | (defun lookup-triple (s p o) 20 | (let ((w (edge-exists? *prolog-graph* s o :edge-type p))) 21 | (when (and (numberp w) (/= 0 w)) 22 | (%make-triple :subject s 23 | :predicate p 24 | :object o 25 | :weight w)))) 26 | 27 | (defun make-triple (subject predicate object &optional weight) 28 | (or (lookup-triple subject predicate object) 29 | (progn 30 | (add-node *prolog-graph* subject) 31 | (add-node *prolog-graph* object) 32 | (add-edge-type *prolog-graph* predicate) 33 | (add-edge *prolog-graph* subject object :edge-type predicate :weight weight) 34 | (%make-triple :subject subject 35 | :predicate predicate 36 | :object object 37 | :weight weight)))) 38 | 39 | (defun add-triple (s p o &optional (w 1)) 40 | (make-triple s p o w)) 41 | 42 | (defun delete-triple (s p o) 43 | (when (edge-exists? *prolog-graph* s o :edge-type p) 44 | (delete-edge *prolog-graph* s o p))) 45 | 46 | (defun triple-equal (t1 t2) 47 | (and (funcall (comparator *prolog-graph*) (subject t1) (subject t2)) 48 | (eql (predicate t1) (predicate t2)) 49 | (funcall (comparator *prolog-graph*) (object t1) (object t2)))) 50 | 51 | (defun get-triples (&key s p o) 52 | (cond ((and s p o) 53 | (list (lookup-triple s p o))) 54 | ((and s o) 55 | (let ((triples nil)) 56 | (dolist (edge-type (edge-types *prolog-graph*)) 57 | (let ((triple (lookup-triple s edge-type o))) 58 | (when (triple? triple) 59 | (push triple triples)))) 60 | triples)) 61 | ((and s p) 62 | (let ((matrix (gethash p (matrix *prolog-graph*))) 63 | (n1 (lookup-node *prolog-graph* s))) 64 | (when (and matrix n1) 65 | (map-sarray-row 66 | #'(lambda (k v) 67 | (%make-triple :subject s 68 | :predicate p 69 | :object (gethash k (ids *prolog-graph*)) 70 | :weight v)) 71 | matrix n1)))) 72 | ((and p o) 73 | (let ((matrix (gethash p (matrix *prolog-graph*))) 74 | (n2 (lookup-node *prolog-graph* o))) 75 | (when (and matrix n2) 76 | (map-sarray-col 77 | #'(lambda (k v) 78 | (%make-triple :subject (gethash k (ids *prolog-graph*)) 79 | :predicate p 80 | :object o 81 | :weight v)) 82 | matrix n2)))) 83 | (s 84 | (mapcan #'(lambda (edge-type) 85 | (get-triples :s s :p edge-type)) 86 | (edge-types *prolog-graph*))) 87 | (p 88 | (map-edges #'(lambda (n1 n2 w edge-type) 89 | (%make-triple :subject (gethash n1 (ids *prolog-graph*)) 90 | :predicate edge-type 91 | :object (gethash n2 (ids *prolog-graph*)) 92 | :weight w)) 93 | *prolog-graph* :collect? t :edge-type p)) 94 | (o 95 | (mapcan #'(lambda (edge-type) 96 | (get-triples :o o :p edge-type)) 97 | (edge-types *prolog-graph*))) 98 | (t 99 | (map-edges #'(lambda (n1 n2 w edge-type) 100 | (%make-triple :subject (gethash n1 (ids *prolog-graph*)) 101 | :predicate edge-type 102 | :object (gethash n2 (ids *prolog-graph*)) 103 | :weight w)) 104 | *prolog-graph* :collect? t)))) 105 | 106 | #| 107 | (defun test-prolog () 108 | (init-prolog) 109 | (add-triple "Kevin" :likes "cats") 110 | (add-triple "Kevin" :likes "Dustie") 111 | (add-triple "Kevin" :hates "dogs") 112 | (add-triple "Dustie" :hates "dogs") 113 | (add-triple "Dustie" :likes "Kevin") 114 | (select (?x ?y) (q- ?x :likes ?y))) 115 | |# 116 | -------------------------------------------------------------------------------- /lib/graph-utils/typed-edge-graph-methods.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:graph-utils) 2 | 3 | (declaim (optimize (speed 3) (space 2))) 4 | 5 | (defmethod in-degree ((graph typed-graph) (node integer) &key edge-type) 6 | (let ((degree 0) (matrix (gethash edge-type (matrix graph)))) 7 | (map-sarray-col #'(lambda (i w) 8 | (declare (ignore i)) 9 | (when (> w 0) 10 | (incf degree))) 11 | matrix node) 12 | degree)) 13 | 14 | (defmethod in-degree ((graph typed-graph) node &key edge-type) 15 | (in-degree graph (lookup-node graph node) :edge-type edge-type)) 16 | 17 | (defmethod out-degree ((graph typed-graph) (node integer) &key edge-type) 18 | (let ((degree 0) (matrix (gethash edge-type (matrix graph)))) 19 | (map-sarray-row #'(lambda (i w) 20 | (declare (ignore i)) 21 | (when (> w 0) 22 | (incf degree))) 23 | matrix node) 24 | degree)) 25 | 26 | (defmethod out-degree ((graph typed-graph) node &key edge-type) 27 | (out-degree graph (lookup-node graph node) :edge-type edge-type)) 28 | 29 | (defmethod degree-distribution ((graph typed-graph) &key edge-type) 30 | (let ((dist nil) (matrix (gethash edge-type (matrix graph)))) 31 | (maphash #'(lambda (node id) 32 | (declare (ignore node)) 33 | (let ((degree 0)) 34 | (loop 35 | for i 36 | from 0 37 | to (1- (row-count matrix)) do 38 | (when (not (zerop (saref matrix id i))) 39 | (incf degree))) 40 | (if (assoc degree dist) 41 | (incf (cdr (assoc degree dist))) 42 | (push (cons degree 1) dist)))) 43 | (nodes graph)) 44 | (sort dist #'< :key 'car))) 45 | 46 | (defmethod in-degree-distribution ((graph typed-graph) &key edge-type) 47 | (let ((dist nil) (matrix (gethash edge-type (matrix graph)))) 48 | (maphash #'(lambda (node id) 49 | (declare (ignore node)) 50 | (let ((degree 0)) 51 | (loop 52 | for i 53 | from 0 54 | to (1- (col-count matrix)) do 55 | (when (not (zerop (saref matrix i id))) 56 | (incf degree))) 57 | (if (assoc degree dist) 58 | (incf (cdr (assoc degree dist))) 59 | (push (cons degree 1) dist)))) 60 | (nodes graph)) 61 | (sort dist #'< :key 'car))) 62 | 63 | (defmethod sim-rank ((graph typed-graph) n1 n2 &key edge-type) 64 | (declare (ignore n1 n2 edge-type)) 65 | ) 66 | -------------------------------------------------------------------------------- /lib/graph-utils/utilities.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:graph-utils) 2 | 3 | (defparameter *graph-utils-debug* t) 4 | ;; Prolog specials 5 | (defparameter *occurs-check* t) 6 | (defvar *trail* (make-array 200 :fill-pointer 0 :adjustable t)) 7 | (defvar *var-counter* 0 "Counter for generating variable names.") 8 | (defvar *functor* nil "The Prolog functor currently being compiled.") 9 | (defvar *select-list* nil "Accumulator for prolog selects.") 10 | (defvar *cont* nil "Continuation container for step-wise queries.") 11 | #+sbcl 12 | (defvar *prolog-global-functors* (make-hash-table :synchronized t)) 13 | #-sbcl 14 | (defvar *prolog-global-functors* (make-hash-table)) 15 | #+sbcl 16 | (defvar *user-functors* (make-hash-table :synchronized t :test 'eql)) 17 | #-sbcl 18 | (defvar *user-functors* (make-hash-table :test 'eql)) 19 | (defparameter *prolog-trace* nil) 20 | (alexandria:define-constant +unbound+ :unbound) 21 | (alexandria:define-constant +no-bindings+ '((t . t)) :test 'equalp) 22 | (alexandria:define-constant +fail+ nil) 23 | 24 | (defmacro with-gensyms (syms &body body) 25 | `(let ,(mapcar #'(lambda (s) 26 | `(,s (gensym))) 27 | syms) 28 | ,@body)) 29 | 30 | (defun dbg (control &rest args) 31 | "Debug output function" 32 | (when *graph-utils-debug* 33 | (apply #'format t control args) 34 | (terpri))) 35 | 36 | (defun sum (list) 37 | "Sum a list of numbers" 38 | (apply #'+ list)) 39 | 40 | (defun square (x) 41 | "Square a number" 42 | (* x x)) 43 | 44 | (defun flatten (x) 45 | (labels ((rec (x acc) 46 | (cond ((null x) acc) 47 | ((atom x) (cons x acc)) 48 | (t (rec (car x) (rec (cdr x) acc)))))) 49 | (rec x nil))) 50 | 51 | (defun reuse-cons (x y x-y) 52 | "Return (cons x y), or reuse x-y if it is equal to (cons x y)" 53 | (if (and (eql x (car x-y)) (eql y (cdr x-y))) 54 | x-y 55 | (cons x y))) 56 | 57 | (defun new-interned-symbol (&rest args) 58 | "Concatenate symbols or strings to form an interned symbol" 59 | (intern (format nil "~{~a~}" args))) 60 | 61 | (defun new-symbol (&rest args) 62 | "Concatenate symbols or strings to form an uninterned symbol" 63 | (make-symbol (format nil "~{~a~}" args))) 64 | 65 | (defun length=1 (list) 66 | "Is this a list of exactly one element?" 67 | (and (consp list) (null (cdr list)))) 68 | 69 | (defun proper-listp (x) 70 | "Is x a proper (non-dotted) list?" 71 | (or (null x) 72 | (and (consp x) (proper-listp (rest x))))) 73 | 74 | (defun find-all (item sequence &rest keyword-args 75 | &key (test #'eql) test-not &allow-other-keys) 76 | "Find all those elements of sequence that match item, 77 | according to the keywords. Doesn't alter sequence." 78 | (if test-not 79 | (apply #'remove item sequence 80 | :test-not (complement test-not) keyword-args) 81 | (apply #'remove item sequence 82 | :test (complement test) keyword-args))) 83 | 84 | (defun find-anywhere (item tree) 85 | "Does item occur anywhere in tree? If so, return it." 86 | (cond ((eql item tree) tree) 87 | ((atom tree) nil) 88 | ((find-anywhere item (first tree))) 89 | ((find-anywhere item (rest tree))))) 90 | 91 | (defun unique-find-anywhere-if (predicate tree &optional found-so-far) 92 | "return a list of leaves of tree satisfying predicate, with duplicates removed." 93 | (if (atom tree) 94 | (if (funcall predicate tree) 95 | (adjoin tree found-so-far) 96 | found-so-far) 97 | (unique-find-anywhere-if 98 | predicate 99 | (first tree) 100 | (unique-find-anywhere-if predicate (rest tree) found-so-far)))) 101 | 102 | (defun find-if-anywhere (predicate tree) 103 | "Does predicate apply to any atom in the tree?" 104 | (if (atom tree) 105 | (funcall predicate tree) 106 | (or (find-if-anywhere predicate (first tree)) 107 | (find-if-anywhere predicate (rest tree))))) 108 | 109 | (defun continue-p () 110 | "Ask user if we should continue looking for solutions." 111 | (case (read-char) 112 | (#\; t) 113 | (#\. nil) 114 | (#\newline (continue-p)) 115 | (otherwise 116 | (format t " Type ; to see more or . to stop") 117 | (continue-p)))) 118 | -------------------------------------------------------------------------------- /lib/porter-stemmer/package.lisp: -------------------------------------------------------------------------------- 1 | (defpackage :porter-stemmer 2 | (:use :cl) 3 | (:export #:stem #:unstem #:*keep-unstems-p*)) 4 | -------------------------------------------------------------------------------- /lib/porter-stemmer/porter-stemmer.asd: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Base: 10 -*- 2 | 3 | (defsystem :porter-stemmer 4 | :components ((:file "package") 5 | (:file "stemmer" :depends-on ("package")))) 6 | -------------------------------------------------------------------------------- /libstemmer.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | ;;; This file was automatically generated by SWIG (http://www.swig.org). 3 | ;;; Version 2.0.11 4 | ;;; 5 | ;;; Do not make changes to this file unless you know what you are doing--modify 6 | ;;; the SWIG interface file instead. 7 | 8 | 9 | ;;;SWIG wrapper code starts here 10 | 11 | (cl:defmacro defanonenum (cl:&body enums) 12 | "Converts anonymous enums to defconstants." 13 | `(cl:progn ,@(cl:loop for value in enums 14 | for index = 0 then (cl:1+ index) 15 | when (cl:listp value) do (cl:setf index (cl:second value) 16 | value (cl:first value)) 17 | collect `(cl:defconstant ,value ,index)))) 18 | 19 | (cl:eval-when (:compile-toplevel :load-toplevel) 20 | (cl:unless (cl:fboundp 'swig-lispify) 21 | (cl:defun swig-lispify (name flag cl:&optional (package cl:*package*)) 22 | (cl:labels ((helper (lst last rest cl:&aux (c (cl:car lst))) 23 | (cl:cond 24 | ((cl:null lst) 25 | rest) 26 | ((cl:upper-case-p c) 27 | (helper (cl:cdr lst) 'upper 28 | (cl:case last 29 | ((lower digit) (cl:list* c #\- rest)) 30 | (cl:t (cl:cons c rest))))) 31 | ((cl:lower-case-p c) 32 | (helper (cl:cdr lst) 'lower (cl:cons (cl:char-upcase c) rest))) 33 | ((cl:digit-char-p c) 34 | (helper (cl:cdr lst) 'digit 35 | (cl:case last 36 | ((upper lower) (cl:list* c #\- rest)) 37 | (cl:t (cl:cons c rest))))) 38 | ((cl:char-equal c #\_) 39 | (helper (cl:cdr lst) '_ (cl:cons #\- rest))) 40 | (cl:t 41 | (cl:error "Invalid character: ~A" c))))) 42 | (cl:let ((fix (cl:case flag 43 | ((constant enumvalue) "+") 44 | (variable "*") 45 | (cl:t "")))) 46 | (cl:intern 47 | (cl:concatenate 48 | 'cl:string 49 | fix 50 | (cl:nreverse (helper (cl:concatenate 'cl:list name) cl:nil cl:nil)) 51 | fix) 52 | package)))))) 53 | 54 | ;;;SWIG wrapper code ends here 55 | (cffi:define-foreign-library libstemmer 56 | (:unix (:or "/usr/lib/libstemmer.so" "/usr/lib/x86_64-linux-gnu/libstemmer.so")) 57 | (t (:default (:or "/usr/lib/libstemmer.so" "/usr/lib/x86_64-linux-gnu/libstemmer.so")))) 58 | 59 | (cffi:use-foreign-library libstemmer) 60 | 61 | 62 | (cffi:defcfun ("sb_stemmer_list" sb_stemmer_list) :pointer) 63 | 64 | (cffi:defcfun ("sb_stemmer_new" sb_stemmer_new) :pointer 65 | (algorithm :string) 66 | (charenc :string)) 67 | 68 | (cffi:defcfun ("sb_stemmer_delete" sb_stemmer_delete) :void 69 | (stemmer :pointer)) 70 | 71 | (cffi:defcfun ("sb_stemmer_stem" sb_stemmer_stem) :pointer 72 | (stemmer :pointer) 73 | (word :pointer) 74 | (size :int)) 75 | 76 | (cffi:defcfun ("sb_stemmer_length" sb_stemmer_length) :int 77 | (stemmer :pointer)) 78 | -------------------------------------------------------------------------------- /ngrams.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defclass ngram-node () 4 | ((word :accessor word :initform nil :initarg :word) 5 | (child-words :accessor child-words :initform (make-hash-table :test 'equal) 6 | :initarg :child-words) 7 | (value :accessor value :initform nil :initarg :value))) 8 | 9 | (defmethod corpus-ngram-p ((language language) text) 10 | (scan "^(\\w+_\\w+)+$" text)) 11 | 12 | (defmethod ngram-components ((language language) text) 13 | (split "_" text)) 14 | 15 | (defmethod print-object ((node ngram-node) stream) 16 | (print-unreadable-object (node stream :type t) 17 | (format stream "'~A'" (word node)) 18 | (when (value node) 19 | (format stream ": ~S" (value node))))) 20 | 21 | (defun make-ngram-tree () 22 | (make-instance 'ngram-node :word :root)) 23 | 24 | (defun child-word-p (parent word) 25 | (gethash word (child-words parent))) 26 | 27 | (defun maybe-add-ngram-node (parent word) 28 | (or (child-word-p parent word) 29 | (let ((node (make-instance 'ngram-node :word word))) 30 | (setf (gethash word (child-words parent)) node) 31 | node))) 32 | 33 | (defun add-ngram (tree string part-of-speech) 34 | (let ((ngram (split "_" string)) 35 | (parent tree)) 36 | (dotimes (i (length ngram)) 37 | (let ((node (maybe-add-ngram-node parent (elt ngram i)))) 38 | (if (= i (- (length ngram) 1)) 39 | (pushnew part-of-speech (value node)) 40 | (setq parent node)))) 41 | tree)) 42 | 43 | (defun ngram-p (tree words) 44 | (labels 45 | ((walk-tree (parent word-list) 46 | (if (null word-list) 47 | (value parent) 48 | (let ((child (child-word-p parent (first word-list)))) 49 | (when child 50 | (walk-tree child (rest word-list))))))) 51 | (walk-tree tree words))) 52 | 53 | (defmethod extract-skip-bigrams ((language nlp:language) (tokens sequence) 54 | &key (skip 2) stem-p) 55 | (let ((skip-grams (make-hash-table :test 'equalp)) 56 | (tokens-length (length tokens))) 57 | (loop for i from 0 below tokens-length do 58 | (loop for j from (1+ i) to (+ 1 skip i) do 59 | (unless (>= j tokens-length) 60 | (let ((token1 (if stem-p 61 | (nlp:stem language (elt tokens i)) 62 | (elt tokens i))) 63 | (token2 (if stem-p 64 | (nlp:stem language (elt tokens j)) 65 | (elt tokens j)))) 66 | (unless (equalp token1 token2) 67 | (incf (gethash 68 | (if (string-lessp token1 token2) 69 | (list token1 token2) 70 | (list token2 token1)) 71 | skip-grams 0))))))) 72 | skip-grams)) 73 | 74 | (defmethod extract-skip-trigrams ((language nlp:language) (tokens sequence) 75 | &key (skip 2) stem-p) 76 | (let ((skip-grams (make-hash-table :test 'equalp)) 77 | (tokens-length (length tokens))) 78 | (loop for i from 0 below tokens-length do 79 | (loop for j from (1+ i) to (+ 1 skip i) do 80 | (unless (>= j tokens-length) 81 | (loop for k from (1+ j) to (+ 1 skip j) do 82 | (unless (>= k tokens-length) 83 | (let ((token1 (if stem-p 84 | (nlp:stem language (elt tokens i)) 85 | (elt tokens i))) 86 | (token2 (if stem-p 87 | (nlp:stem language (elt tokens j)) 88 | (elt tokens j))) 89 | (token3 (if stem-p 90 | (nlp:stem language (elt tokens k)) 91 | (elt tokens k)))) 92 | (unless (or (equalp token1 token2) 93 | (equalp token1 token3) 94 | (equalp token2 token3)) 95 | (let ((key (sort (list token1 token2 token3) 'string-lessp))) 96 | ;;(let ((key (list token1 token2 token3))) 97 | (incf (gethash key skip-grams 0)))))))))) 98 | skip-grams)) 99 | 100 | (defmethod extract-ngrams ((tokens sequence) &key include-quadrigrams-p) 101 | (let ((sentence-length (length tokens)) 102 | (total-unigrams 0) 103 | (total-bigrams 0) 104 | (total-trigrams 0) 105 | (total-quadrigrams 0) 106 | (ngram-table (make-hash-table :test 'equalp))) 107 | (loop for i from 0 below sentence-length do 108 | ;; get the unigrams 109 | (incf total-unigrams) 110 | (incf (gethash (list (elt tokens i)) ngram-table 0)) 111 | ;; get the bigrams 112 | (when (< i (- sentence-length 1)) 113 | (incf total-bigrams) 114 | (incf (gethash (list (elt tokens i) 115 | (elt tokens (1+ i))) 116 | ngram-table 0)) 117 | ;; get the trigrams 118 | (when (< i (- sentence-length 2)) 119 | (incf total-trigrams) 120 | (incf (gethash (list (elt tokens i) 121 | (elt tokens (1+ i)) 122 | (elt tokens (+ 2 i))) 123 | ngram-table 0)) 124 | ;; get the quadrigrams if requested 125 | (when (and include-quadrigrams-p 126 | (< i (- sentence-length 3))) 127 | (incf total-quadrigrams) 128 | (incf (gethash (list (elt tokens i) 129 | (elt tokens (1+ i)) 130 | (elt tokens (+ 2 i)) 131 | (elt tokens (+ 3 i))) 132 | ngram-table 0)))))) 133 | (values ngram-table 134 | total-unigrams 135 | total-bigrams 136 | total-trigrams 137 | total-quadrigrams))) 138 | -------------------------------------------------------------------------------- /nlp-package.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:cl-user) 2 | 3 | (defpackage #:nlp 4 | (:use #:cl #:kyoto-cabinet #:cl-ppcre #:parse-number #:dso-lex #:yacc 5 | #:graph-utils #:cffi-wordnet) 6 | (:export #:*language* 7 | #:*languages* 8 | #:language 9 | #:language-p 10 | #:freeze-nlp 11 | #:thaw-nlp 12 | #:lookup-language 13 | #:close-language 14 | #:make-language 15 | #:make-english-db 16 | #:make-spanish-db 17 | #:make-portuguese-db 18 | #:name 19 | #:default-encoding 20 | 21 | ;; Language classes we support 22 | #:english 23 | #:spanish 24 | #:portuguese 25 | #:italian 26 | #:german 27 | #:french 28 | 29 | #:edit-distance 30 | #:longest-common-subseq 31 | #:split-sentences 32 | #:tokenize 33 | #:train-tagger 34 | #:tag-sentence 35 | #:tag 36 | #:tag-as-text 37 | #:noun-p 38 | #:verb-p 39 | #:adjective-p 40 | #:punctuation-p 41 | 42 | #:earley-parse 43 | #:chart-parse 44 | #:cyk-parse 45 | #:pcp-parse 46 | #:p-chart-parse 47 | #:extract-phrases 48 | #:all-phrases 49 | #:train-phrase-extractor 50 | #:singularize 51 | #:extract-ngrams 52 | #:extract-skip-bigrams 53 | #:extract-skip-trigrams 54 | 55 | #:in-lexicon-p 56 | #:add-to-lexicon 57 | #:lookup-pos 58 | #:possible-tags 59 | #:pos-similarity 60 | #:tree-similarity 61 | #:pos-edit-distance 62 | 63 | #:spell-check 64 | #:correct-spelling 65 | 66 | #:join 67 | #:stem 68 | #:add-stop-word 69 | #:remove-stop-word 70 | #:stop-word-p 71 | #:remove-stop-words 72 | #:wildcard-stop-words 73 | 74 | #:synset 75 | #:singularize 76 | #:verb-base-form 77 | #:synonyms 78 | #:synonym-p 79 | #:hypernyms 80 | #:meronyms 81 | #:instances 82 | #:holonyms 83 | #:hyponyms 84 | #:semantic-neighborhood 85 | #:semantic-parents 86 | #:semantic-children 87 | #:glosses 88 | #:wordnet-pos 89 | #:synset-word-list 90 | #:synset-type 91 | 92 | #:semantic-similarity 93 | #:semantic-distance 94 | #:negative-squared-semantic-distance 95 | #:clear-semantic-caches 96 | 97 | #:flatten 98 | #:total-word-count 99 | #:word-occurrence 100 | 101 | #:+punctuation-tag+ 102 | #:+foreign-tag+ 103 | #:+adjective-tag+ 104 | #:+conjunction-tag+ 105 | #:+determiner-tag+ 106 | #:+noun-tag+ 107 | #:+pronoun-tag+ 108 | #:+adverb-tag+ 109 | #:+adposition-tag+ 110 | #:+verb-tag+ 111 | #:+number-tag+ 112 | #:+particle-tag+ 113 | )) 114 | -------------------------------------------------------------------------------- /nlp.asd: -------------------------------------------------------------------------------- 1 | ;; ASDF package description for nlp -*- Lisp -*- 2 | 3 | (defpackage :nlp-system (:use :cl :asdf)) 4 | (in-package :nlp-system) 5 | 6 | (defsystem nlp 7 | :name "cl-nlp" 8 | :maintainer "Kevin Raison" 9 | :author "Kevin Raison " 10 | :version "2.0" 11 | :description "Multi-language Natural Language Processing Utilities" 12 | :long-description "Multi-language Natural Language Processing Utilities." 13 | :depends-on (:cl-ppcre 14 | :babel 15 | :cl-kyoto-cabinet 16 | :cffi 17 | :cl-store 18 | :parse-number 19 | :dso-lex 20 | :yacc 21 | :alexandria 22 | :porter-stemmer 23 | #+sbcl :sb-concurrency 24 | :cl-heap 25 | :rcl 26 | :log4cl 27 | :graph-utils 28 | :cffi-wordnet 29 | :cl-fad 30 | :s-xml 31 | :cxml 32 | :split-sequence) 33 | :components ((:file "nlp-package") 34 | (:file "globals" :depends-on ("nlp-package")) 35 | (:file "pos-symbols" :depends-on ("globals")) 36 | (:file "utilities" :depends-on ("pos-symbols")) 37 | (:file "db" :depends-on ("utilities")) 38 | (:file "kyoto-lexicon" :depends-on ("db")) 39 | (:file "db-methods" :depends-on ("kyoto-lexicon")) 40 | (:file "corpora" :depends-on ("utilities")) 41 | (:file "pos-map" :depends-on ("corpora")) 42 | 43 | (:file "read-cess-esp" :depends-on ("corpora")) 44 | (:file "read-ancora-es" :depends-on ("corpora")) 45 | (:file "read-wikicorpus-es" :depends-on ("corpora")) 46 | (:file "read-tut" :depends-on ("corpora")) 47 | (:file "read-paisa" :depends-on ("corpora")) 48 | (:file "read-floresta" :depends-on ("corpora")) 49 | (:file "read-tiger" :depends-on ("corpora")) 50 | 51 | (:file "base-forms" :depends-on ("globals")) 52 | (:file "lexicon" :depends-on ("kyoto-lexicon")) 53 | (:file "stop-words" :depends-on ("db-methods")) 54 | (:file "sentence-splitter" :depends-on ("utilities")) 55 | (:file "libstemmer" :depends-on ("db-methods")) 56 | (:file "stemmer" :depends-on ("libstemmer")) 57 | (:file "edit-distance" :depends-on ("utilities")) 58 | (:file "ngrams" :depends-on ("utilities")) 59 | (:file "wordnet" :depends-on ("db-methods")) 60 | (:file "semantic-similarity" :depends-on ("wordnet")) 61 | (:file "semantic-similarity-li" :depends-on ("semantic-similarity")) 62 | (:file "semantic-similarity-raison" :depends-on 63 | ("semantic-similarity-li")) 64 | (:file "pos-tag" :depends-on 65 | ("corpora" "lexicon" "db-methods" "sentence-splitter")) 66 | (:file "grammar" :depends-on ("corpora" "lexicon" "db-methods")) 67 | (:file "parser" :depends-on ("grammar" "pos-tag")) 68 | (:file "chunker" :depends-on ("grammar" "pos-tag")) 69 | (:file "prob-parser" :depends-on ("parser")) 70 | (:file "pcp" :depends-on ("parser")) 71 | (:file "initialize" :depends-on 72 | ("stop-words" "prob-parser" "pcp" "wordnet" "semantic-similarity-raison")) 73 | (:file "english" :depends-on ("initialize" "stemmer")) 74 | (:file "english-lexicon" :depends-on ("english")) 75 | (:file "spanish" :depends-on 76 | ("initialize" "read-cess-esp" "read-ancora-es" 77 | "read-wikicorpus-es" "stemmer")) 78 | (:file "spanish-lexicon" :depends-on ("spanish")) 79 | (:file "italian" :depends-on 80 | ("initialize" "read-tut" "read-paisa" "stemmer")) 81 | (:file "portuguese" :depends-on 82 | ("initialize" "read-floresta" "stemmer")) 83 | (:file "german" :depends-on 84 | ("initialize" "read-tiger" "stemmer")) 85 | (:file "french" :depends-on 86 | ("initialize" "stemmer")) 87 | (:file "french-lexicon" :depends-on ("french")) 88 | )) 89 | -------------------------------------------------------------------------------- /parsing.lisp: -------------------------------------------------------------------------------- 1 | (ql:quickload :nlp) 2 | (ql:quickload "trivial-timeout") 3 | 4 | (in-package :nlp) 5 | 6 | (init-nlp "nlp.dat") 7 | 8 | (defun tomuro-cfg (&optional (corpus "data/all-parsed.txt")) 9 | (load-cfg "data/grammar.txt" *pos-db*) 10 | (with-open-file (log "tomuro.log" 11 | :direction :output 12 | :if-exists :supersede) 13 | (map-sexp-corpus 14 | (lambda (tree) 15 | (let ((words (syntax-leaves tree))) 16 | (format t "~A~%" words) 17 | (format log "~A~%" words) 18 | (multiple-value-bind (parse tags) 19 | (handler-case 20 | (trivial-timeout:with-timeout (10) 21 | (chart-parse (butlast words))) 22 | (error (c) 23 | (declare (ignore c)) 24 | #+sbcl (sb-ext:gc :full t) 25 | (values nil (possible-tags words)))) 26 | (format t "~A~%~A~%~%" tags parse) 27 | (format log "~A~%~A~%~%" tags parse)))) 28 | corpus))) 29 | 30 | (tomuro-cfg) 31 | (quit) 32 | -------------------------------------------------------------------------------- /pcfg.lisp: -------------------------------------------------------------------------------- 1 | (in-package #:nlp) 2 | 3 | (defun make-pcfg-from-cfg (file &optional pos-db) 4 | (let ((rules (make-hash-table)) 5 | (s-count (make-hash-table)) 6 | (probabilities (make-hash-table)) 7 | (index (make-hash-table))) 8 | (add-rule :start '(S) rules) 9 | (init-or-increment s-count :start) 10 | (with-open-file (in file) 11 | (do ((line (read-line in nil nil) (read-line in nil nil))) 12 | ((null line)) 13 | (when (> (length line) 1) 14 | (destructuring-bind (lhs rhs) (split "\\s*->\\s*" line) 15 | (let ((lhs (intern lhs)) 16 | (rhs (mapcar #'intern (split "\\s+" rhs)))) 17 | (init-or-increment s-count lhs) 18 | (add-rule lhs rhs rules)))))) 19 | (maphash #'(lambda (lhs rhs) 20 | (unless (hash-table-p (gethash lhs probabilities)) 21 | (setf (gethash lhs probabilities) (make-hash-table :test 'equalp))) 22 | (dolist (rhs rhs) 23 | (setf (gethash rhs (gethash lhs probabilities)) 24 | (/ 1 (gethash lhs s-count)))) 25 | (if (listp rhs) 26 | (dolist (p rhs) 27 | (if (listp p) 28 | (add-rule (first p) lhs index) 29 | (add-rule p lhs index))) 30 | (add-rule rhs lhs index))) 31 | rules) 32 | (if (pos-db? pos-db) 33 | (values (setf (pos-cfg pos-db) rules) 34 | (setf (pos-cfg-idx pos-db) index) 35 | (setf (pos-pcfg pos-db) probabilities)) 36 | (values rules index probabilities)))) 37 | -------------------------------------------------------------------------------- /portuguese.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defclass portuguese (language) 4 | ((name :accessor name :initform "portuguese" :initarg :name) 5 | (aliases :accessor aliases 6 | :initform '("pt" "pt-br" "pt-pt") 7 | :initarg :aliases) 8 | (vowels :accessor vowels :initform '(#\a #\e #\i #\o #\u #\â #\á #\ã #\ê #\é #\ó #\ô) 9 | :initarg :vowels) 10 | (default-encoding :accessor default-encoding :initform :latin-1 11 | :initarg :default-encoding))) 12 | 13 | (defmethod noun-p ((language portuguese) symbol) 14 | (member symbol '(:N :PROP))) 15 | 16 | (defmethod adjective-p ((language portuguese) symbol) 17 | (member symbol '(:ADJ))) 18 | 19 | (defmethod verb-p ((language portuguese) symbol) 20 | (member symbol '(:V))) 21 | 22 | (defmethod adverb-p ((language portuguese) symbol) 23 | (member symbol '(:ADV))) 24 | 25 | (defmethod determiner-p ((language portuguese) symbol) 26 | (member symbol '(:DET))) 27 | 28 | (defmethod pronoun-p ((language portuguese) symbol) 29 | (member symbol '(:SPEC :PERS))) 30 | 31 | (defun make-portuguese-db (&key 32 | profile-p 33 | save-p 34 | (user-lexicon "data/portuguese-lexicon.txt") 35 | (stop-words-file "data/portuguese-stop-words.txt") 36 | (pos-lex "data/portuguese-pos.txt") 37 | (pos-train "data/portuguese-pos.txt") 38 | (chunker-train "data/portuguese-parsed.txt")) 39 | (let ((*language* (make-new-language 'portuguese))) 40 | (setf (alphabet *language*) 41 | "âáãêéóô¡abcdefghijklmnopqrstuvwxyz0123456789-'/") 42 | (log:info "Training Portuguese NLP system...") 43 | (log:info "Building and training lexicon...") 44 | (load-stop-words *language* stop-words-file) 45 | (maybe-profile 46 | (make-lexicon *language* 47 | pos-lex 48 | :user-file user-lexicon 49 | :external-format :utf-8)) 50 | ;;(clean-lexicon *language*) 51 | (log:info "Training POS tagger...") 52 | (maybe-profile (train-tagger *language* pos-train :external-format :utf-8)) 53 | (log:info "Training HMM Chunker...") 54 | (maybe-profile (train-phrase-extractor *language* chunker-train)) 55 | (when save-p 56 | (log:info "Freezing POS database...") 57 | (maybe-profile (freeze-nlp *language*))) 58 | (add-language *language*) 59 | *language*)) 60 | -------------------------------------------------------------------------------- /pos-map.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defvar *pos-maps* (make-hash-table :test 'eql)) 4 | 5 | (defmacro def-pos-map (name pairs) 6 | `(let ((map 7 | (list 8 | ,@(mapcar (lambda (pair) 9 | `(cons 10 | ,(first pair) 11 | ,(if (symbolp (second pair)) 12 | `(intern (symbol-name ',(second pair)) :keyword) 13 | `(intern ,(second pair) :keyword)))) 14 | pairs)))) 15 | (dolist (pair map) 16 | (unless (valid-pos-tag-p (cdr pair)) 17 | (error "Invalid tags in definition of ~A: ~A" ',name pair))) 18 | (setf (gethash ,name *pos-maps*) map))) 19 | 20 | (defgeneric lookup-generic-pos (name pos &key test)) 21 | 22 | (defmethod lookup-generic-pos (name pos &key (test 'equal)) 23 | (let ((alist (gethash name *pos-maps*))) 24 | (when alist 25 | (cdr (assoc pos alist :test test))))) 26 | 27 | (defgeneric lookup-specific-pos (name pos &key test)) 28 | 29 | (defmethod lookup-generic-pos (name pos &key (test 'equal)) 30 | (let ((alist (gethash name *pos-maps*))) 31 | (when alist 32 | (let ((tag (car (assoc pos alist :test test)))) 33 | (when tag 34 | (intern (string-upcase tag) :keyword)))))) 35 | -------------------------------------------------------------------------------- /pos-symbols.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (alexandria:define-constant +punctuation-tag+ :|.|) 4 | (alexandria:define-constant +foreign-tag+ :X) 5 | (alexandria:define-constant +adjective-tag+ :ADJ) 6 | (alexandria:define-constant +conjunction-tag+ :CONJ) 7 | (alexandria:define-constant +determiner-tag+ :DET) 8 | (alexandria:define-constant +noun-tag+ :NOUN) 9 | (alexandria:define-constant +pronoun-tag+ :PRON) 10 | (alexandria:define-constant +adverb-tag+ :ADV) 11 | (alexandria:define-constant +adposition-tag+ :ADP) 12 | (alexandria:define-constant +verb-tag+ :VERB) 13 | (alexandria:define-constant +number-tag+ :NUM) 14 | (alexandria:define-constant +particle-tag+ :PRT) 15 | 16 | (defparameter *part-of-speech-tags* 17 | (list 18 | +punctuation-tag+ 19 | +foreign-tag+ 20 | +adjective-tag+ 21 | +conjunction-tag+ 22 | +determiner-tag+ 23 | +noun-tag+ 24 | +pronoun-tag+ 25 | +adverb-tag+ 26 | +adposition-tag+ 27 | +verb-tag+ 28 | +number-tag+ 29 | +particle-tag+) 30 | "Derived from https://github.com/slavpetrov/universal-pos-tags") 31 | 32 | (defun valid-pos-tag-p (tag) 33 | (member tag *part-of-speech-tags*)) 34 | 35 | (defgeneric noun-p (language symbol)) 36 | (defgeneric verb-p (language symbol)) 37 | (defgeneric adjective-p (language symbol)) 38 | (defgeneric adverb-p (language symbol)) 39 | (defgeneric pronoun-p (language symbol)) 40 | (defgeneric determiner-p (language symbol)) 41 | -------------------------------------------------------------------------------- /prob-parser.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | ;;; Keivn's experimental probabilistic parser. 4 | ;;; This works, but not well 5 | ;;; The PCP parser in pcp.lisp actually works. 6 | (defun p-find-trees-in-chart (chart words) 7 | (declare (ignore words)) 8 | (mapcar 'edge->tree 9 | (sort 10 | (remove-if-not (lambda (e) 11 | (and (complete? e) 12 | (= (left-vertex e) 0) 13 | (eq :start (label e)))) 14 | (elt (chart-edge-vec chart) 15 | (1- (length (chart-edge-vec chart))))) 16 | '> 17 | :key 'probability))) 18 | 19 | (defun p-bottom-up-rule (chart edge) 20 | (let* ((pos (label edge)) 21 | (productions (lookup-pos-productions pos))) 22 | (dolist (s productions) 23 | (let ((p-hash (gethash s (pcfg *language*)))) 24 | (maphash (lambda (production p) 25 | (if (listp production) 26 | (when (eq (first production) pos) 27 | (add-edge-to-chart 28 | chart (make-chart-edge 29 | :left-vertex (left-vertex edge) 30 | :right-vertex (left-vertex edge) 31 | :label s 32 | :to-find production 33 | :found nil 34 | :probability p))) 35 | (when (eq production pos) 36 | (add-edge-to-chart 37 | chart (make-chart-edge 38 | :left-vertex (left-vertex edge) 39 | :right-vertex (left-vertex edge) 40 | :label s 41 | :to-find (list production) 42 | :found nil 43 | :probability p))))) 44 | p-hash))) 45 | (setf (elt (chart-edge-vec chart) (left-vertex edge)) 46 | (sort (elt (chart-edge-vec chart) (left-vertex edge)) 47 | '> :key 'probability)))) 48 | 49 | (defun p-fundamental-rule (chart child-edge) 50 | (dolist (edge (elt (chart-edge-vec chart) (left-vertex child-edge))) 51 | (when (and (eq (label child-edge) (first (to-find edge))) 52 | (>= (left-vertex child-edge) 53 | (left-vertex edge)) 54 | (/= (right-vertex child-edge) 55 | (right-vertex edge))) 56 | (let ((new-edge (make-chart-edge 57 | :left-vertex (left-vertex edge) 58 | :right-vertex (right-vertex child-edge) 59 | :label (label edge) 60 | :to-find (rest (to-find edge)) 61 | :found (cons child-edge (found edge))))) 62 | (setf (probability new-edge) 63 | (* (probability edge) 64 | (reduce '* (found edge) :key 'probability))) 65 | (when (add-edge-to-chart chart new-edge) 66 | (setf (elt (chart-edge-vec chart) (right-vertex new-edge)) 67 | (sort (elt (chart-edge-vec chart) 68 | (right-vertex new-edge)) 69 | '> :key 'probability)) 70 | (when (null (to-find new-edge)) 71 | (p-bottom-up-rule chart new-edge) 72 | (p-fundamental-rule chart new-edge))))))) 73 | 74 | (defun p-initialize-word (chart pos word p i) 75 | (let ((edge (make-chart-edge :left-vertex i 76 | :right-vertex (1+ i) 77 | :label pos 78 | :edge-word word 79 | :to-find nil 80 | :found (list word) 81 | :probability p))) 82 | (add-edge-to-chart chart edge) 83 | (p-bottom-up-rule chart edge) 84 | (p-fundamental-rule chart edge))) 85 | 86 | (defmethod p-chart-parse ((language language) text) 87 | (multiple-value-bind (pos-with-p words) 88 | (possible-tags language text :p? t) 89 | (let ((chart (make-chart :edge-vec 90 | (make-array (1+ (length words)) 91 | :initial-element nil)))) 92 | (dotimes (i (length words)) 93 | (let ((word (nth i words)) (pos-list (nth i pos-with-p))) 94 | (dolist (pos (sort pos-list '> :key 'cdr)) 95 | (unless (zerop (cdr pos)) 96 | (log:debug "Initializing ~A(~F) -> ~A" 97 | word (cdr pos) (car pos)) 98 | (p-initialize-word chart (car pos) word (cdr pos) i))))) 99 | ;;chart))) 100 | (values (p-find-trees-in-chart chart words) 101 | (mapcar (lambda (w p) 102 | (list w p)) words pos-with-p))))) 103 | -------------------------------------------------------------------------------- /prolog/prologdb.5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/prolog/prologdb.5.pdf -------------------------------------------------------------------------------- /prolog/senseidx.5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/prolog/senseidx.5.pdf -------------------------------------------------------------------------------- /prolog/wn_cs.pl: -------------------------------------------------------------------------------- 1 | cs(200019273,200014742). 2 | cs(200020449,200020259). 3 | cs(200025203,200026153). 4 | cs(200025654,200026385). 5 | cs(200053656,200053889). 6 | cs(200055871,200055539). 7 | cs(200064643,202122164). 8 | cs(200073813,200073343). 9 | cs(200075021,200076114). 10 | cs(200077950,200002724). 11 | cs(200098083,200024047). 12 | cs(200100551,200099721). 13 | cs(200126264,200109660). 14 | cs(200144169,200141524). 15 | cs(200202569,200202784). 16 | cs(200205885,200205046). 17 | cs(200218475,200219403). 18 | cs(200220869,200220461). 19 | cs(200224901,200223500). 20 | cs(200227165,200226566). 21 | cs(200238542,200238372). 22 | cs(200241038,200240571). 23 | cs(200245457,200231557). 24 | cs(200249679,200248026). 25 | cs(200249969,200249852). 26 | cs(200253761,200252019). 27 | cs(200253761,200230746). 28 | cs(200254672,200254298). 29 | cs(200255389,200254867). 30 | cs(200256369,200263231). 31 | cs(200257650,200256507). 32 | cs(200258528,200257947). 33 | cs(200266586,200266197). 34 | cs(200270005,201831531). 35 | cs(200282076,200281101). 36 | cs(200300537,202659763). 37 | cs(200303661,200303465). 38 | cs(200306723,200306298). 39 | cs(200311559,200312380). 40 | cs(200312990,200311113). 41 | cs(200314272,200314782). 42 | cs(200316768,200316494). 43 | cs(200317569,200317468). 44 | cs(200317700,200316631). 45 | cs(200318816,200240810). 46 | cs(200348746,202608347). 47 | cs(200352826,202609764). 48 | cs(200361952,200361797). 49 | cs(200364629,200364868). 50 | cs(200370412,200369864). 51 | cs(200371264,200372665). 52 | cs(200372501,200370263). 53 | cs(200374668,200375021). 54 | cs(200375865,200374135). 55 | cs(200385385,200384411). 56 | cs(200385685,200384411). 57 | cs(200389406,200389238). 58 | cs(200392426,200392314). 59 | cs(200416705,200416880). 60 | cs(200417596,200418563). 61 | cs(200419375,200419137). 62 | cs(200421125,200421408). 63 | cs(200430625,200430999). 64 | cs(200431327,200431117). 65 | cs(200443384,200443116). 66 | cs(200445467,200445169). 67 | cs(200445940,200443670). 68 | cs(200446695,200446514). 69 | cs(200447309,200446329). 70 | cs(200449692,200448680). 71 | cs(200452512,200451838). 72 | cs(200454868,202066939). 73 | cs(200458754,200458471). 74 | cs(200459296,200459498). 75 | cs(200461493,200461354). 76 | cs(200469187,200469030). 77 | cs(200500834,200500638). 78 | cs(200501159,200501048). 79 | cs(200519229,200519056). 80 | cs(200538454,200538323). 81 | cs(200539936,200540101). 82 | cs(200550823,200550546). 83 | cs(200564300,200564151). 84 | cs(200591755,200588888). 85 | cs(200592037,200588888). 86 | cs(200601378,200601043). 87 | cs(200610538,200607780). 88 | cs(200699485,200697589). 89 | cs(200829107,200597915). 90 | cs(200830498,200598502). 91 | cs(200834943,200834745). 92 | cs(200851239,202367363). 93 | cs(200859325,200859153). 94 | cs(200859937,200032297). 95 | cs(200933821,200935987). 96 | cs(200937023,200935987). 97 | cs(200968211,200969873). 98 | cs(200973056,200973728). 99 | cs(201113620,201111816). 100 | cs(201133825,201134238). 101 | cs(201154175,201848465). 102 | cs(201158872,202676789). 103 | cs(201169589,201170052). 104 | cs(201178565,201168468). 105 | cs(201187740,201188144). 106 | cs(201200440,201200934). 107 | cs(201258302,201970826). 108 | cs(201294396,201294182). 109 | cs(201295275,201291069). 110 | cs(201296462,201290422). 111 | cs(201323958,200358431). 112 | cs(201340439,201343892). 113 | cs(201344293,201344140). 114 | cs(201345109,201346978). 115 | cs(201346003,201346804). 116 | cs(201356370,201356750). 117 | cs(201369758,201369346). 118 | cs(201370126,202041877). 119 | cs(201458228,201457954). 120 | cs(201481360,201482075). 121 | cs(201543998,201543123). 122 | cs(201544285,201543123). 123 | cs(201544692,201547001). 124 | cs(201546768,201546111). 125 | cs(201556921,201557774). 126 | cs(201562209,201562061). 127 | cs(201602665,201602318). 128 | cs(201642924,200339934). 129 | cs(201643657,200339934). 130 | cs(201652895,201652731). 131 | cs(201765908,201767163). 132 | cs(201767949,201771535). 133 | cs(201773535,201773346). 134 | cs(201779165,201780202). 135 | cs(201781983,201782218). 136 | cs(201785971,201787106). 137 | cs(201786419,201795428). 138 | cs(201789164,201767461). 139 | cs(201789270,201773346). 140 | cs(201790383,201791973). 141 | cs(201792567,201794668). 142 | cs(201794523,201794363). 143 | cs(201795888,201796033). 144 | cs(201797582,201797347). 145 | cs(201811736,201813884). 146 | cs(201813053,201813393). 147 | cs(201813499,201812950). 148 | cs(201814074,200860136). 149 | cs(201815628,201777210). 150 | cs(201815628,201776952). 151 | cs(201817130,201776727). 152 | cs(201818235,201811441). 153 | cs(201835103,201834896). 154 | cs(201843055,201835496). 155 | cs(201848058,201902783). 156 | cs(201850315,201831531). 157 | cs(201857717,201864230). 158 | cs(201859221,201860795). 159 | cs(201874424,201874320). 160 | cs(201879777,201879251). 161 | cs(201900760,201900408). 162 | cs(201909978,201909397). 163 | cs(201955009,202009433). 164 | cs(201965156,201963942). 165 | cs(201973125,201970826). 166 | cs(201973759,201968569). 167 | cs(201974062,201968569). 168 | cs(201975587,201983134). 169 | cs(201981036,201979901). 170 | cs(201986869,201989873). 171 | cs(201989562,201989053). 172 | cs(201997512,201997119). 173 | cs(202014024,201855606). 174 | cs(202022486,202022359). 175 | cs(202042843,202042404). 176 | cs(202045790,202045043). 177 | cs(202052476,202050132). 178 | cs(202067689,202066939). 179 | cs(202069551,202066939). 180 | cs(202071837,202071627). 181 | cs(202076501,202076280). 182 | cs(202084380,201919391). 183 | cs(202086805,202056300). 184 | cs(202086963,202064131). 185 | cs(202110927,202110220). 186 | cs(202115778,202106006). 187 | cs(202125641,202123672). 188 | cs(202137132,202129289). 189 | cs(202138075,200422090). 190 | cs(202157100,200422090). 191 | cs(202166761,202166460). 192 | cs(202179518,202176268). 193 | cs(202181538,202176268). 194 | cs(202186690,202186506). 195 | cs(202191766,202194286). 196 | cs(202199590,202203362). 197 | cs(202220461,202221959). 198 | cs(202254767,202254923). 199 | cs(202264179,202264397). 200 | cs(202380251,202379753). 201 | cs(202380571,202379753). 202 | cs(202404904,201831531). 203 | cs(202407987,202410855). 204 | cs(202429810,202428924). 205 | cs(202435867,202434238). 206 | cs(202439501,202367363). 207 | cs(202504562,202367363). 208 | cs(202506546,202367363). 209 | cs(202523953,202525044). 210 | cs(202539788,202540347). 211 | cs(202594469,202594290). 212 | cs(202598143,202428924). 213 | cs(202660290,202659763). 214 | cs(202681639,202681524). 215 | cs(202712914,202713184). 216 | cs(202720697,202720544). 217 | cs(202753642,202753426). 218 | cs(202759614,200377002). 219 | cs(202762468,200377002). 220 | cs(202763283,202763740). 221 | -------------------------------------------------------------------------------- /prolog/wn_ppl.pl: -------------------------------------------------------------------------------- 1 | ppl(303147281,1,201153486,2). 2 | ppl(303147408,1,201153486,2). 3 | ppl(303147543,1,201624897,1). 4 | ppl(303147643,1,201589497,1). 5 | ppl(303147643,2,201589497,1). 6 | ppl(303147793,1,201959340,1). 7 | ppl(303147919,1,201380638,3). 8 | ppl(303147919,2,201380638,1). 9 | ppl(303148137,1,201380638,3). 10 | ppl(303148137,2,201380638,1). 11 | ppl(303148333,1,200869596,1). 12 | ppl(303148487,1,200869596,1). 13 | ppl(303148653,1,202352824,1). 14 | ppl(303148724,1,202072849,1). 15 | ppl(303148831,1,201871979,2). 16 | ppl(303149025,1,201675245,2). 17 | ppl(303149169,1,201216670,1). 18 | ppl(303149169,2,201216670,1). 19 | ppl(303149347,1,202441897,1). 20 | ppl(303149478,1,200069012,1). 21 | ppl(303149619,1,202694548,1). 22 | ppl(303149732,1,201183573,3). 23 | ppl(303149843,1,201183573,3). 24 | ppl(303149960,1,202659763,1). 25 | ppl(303150124,1,200181005,1). 26 | ppl(303150124,2,200181005,2). 27 | ppl(303150301,1,200117757,1). 28 | ppl(303150431,1,200117757,1). 29 | ppl(303150569,1,201996735,1). 30 | ppl(303150670,1,200185698,1). 31 | ppl(303150670,2,200185698,1). 32 | ppl(303150897,1,201660547,1). 33 | ppl(303150985,1,201525666,3). 34 | ppl(303151122,1,200238867,1). 35 | ppl(303151122,2,200238867,2). 36 | ppl(303151302,1,201493380,1). 37 | ppl(303151423,1,200364629,1). 38 | ppl(303151423,2,200364629,2). 39 | ppl(303151582,1,200364629,1). 40 | ppl(303151582,2,200364629,2). 41 | ppl(303151711,1,201688604,1). 42 | ppl(303151711,2,201688604,1). 43 | ppl(303151871,1,200298420,1). 44 | ppl(303152015,1,201072949,1). 45 | ppl(303152135,1,200123170,1). 46 | ppl(303152249,1,202142775,2). 47 | ppl(303152350,1,202142775,2). 48 | ppl(303152480,1,200991683,1). 49 | ppl(303152577,1,201633343,2). 50 | ppl(303152759,1,202499629,1). 51 | ppl(303152874,1,202000868,1). 52 | ppl(303153018,1,201881180,10). 53 | ppl(303153155,1,201619354,1). 54 | ppl(303153255,1,200167535,1). 55 | ppl(303153361,1,200144446,1). 56 | ppl(303153554,1,200554468,1). 57 | ppl(303153554,2,200554468,2). 58 | ppl(303153653,1,200914420,1). 59 | ppl(303153755,1,201661096,1). 60 | ppl(303153849,1,201853882,1). 61 | ppl(303153960,1,200215661,1). 62 | ppl(303154143,1,201593937,1). 63 | ppl(303154289,1,201503404,1). 64 | ppl(303154362,1,201359432,1). 65 | ppl(303154463,1,200365810,1). 66 | ppl(303154463,2,200365810,2). 67 | ppl(303154649,1,200497391,1). 68 | ppl(303154649,2,200030142,2). 69 | ppl(303154786,1,202066757,1). 70 | ppl(303154886,1,201315613,2). 71 | ppl(303154986,1,202323870,1). 72 | ppl(303155193,1,200538571,1). 73 | ppl(303155306,1,200538571,1). 74 | -------------------------------------------------------------------------------- /prolog/wndb.5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/prolog/wndb.5.pdf -------------------------------------------------------------------------------- /prolog/wngroups.7: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" $Id$ 3 | .tr ~ 4 | .TH WNGROUPS 7WN "Dec 2006" "WordNet 3.0" "WordNet\(tm" 5 | .SH NAME 6 | wngroups \- discussion of WordNet search code to group similar verb senses 7 | .SH DESCRIPTION 8 | Some similar senses of verbs have been grouped by the lexicographers. 9 | This grouping is done statically in the lexicographer source files 10 | using the semantic \fIpointer_symbol\fP \fB$\fP. 11 | Transitivity is used to combine groups of overlapping 12 | senses into the largest sense groups possible. 13 | .SH NOTES 14 | Coverage of verb groups is incomplete. 15 | .SH ENVIRONMENT VARIABLES (UNIX) 16 | .TP 20 17 | .B WNHOME 18 | Base directory for WordNet. Default is 19 | \fB/usr/local/WordNet-3.0\fP. 20 | .TP 20 21 | .B WNSEARCHDIR 22 | Directory in which the WordNet database has been installed. 23 | Default is \fBWNHOME/dict\fP. 24 | .SH REGISTRY (WINDOWS) 25 | .TP 20 26 | .B HKEY_LOCAL_MACHINE\eSOFTWARE\eWordNet\e3.0\eWNHome 27 | Base directory for WordNet. Default is 28 | \fBC:\eProgram~Files\eWordNet\e3.0\fP. 29 | .SH FILES 30 | .TP 20 31 | .B sentidx.vrb 32 | verb sense keys and sentence frame numbers 33 | .TP 20 34 | .B sents.vrb 35 | example sentence frames 36 | .SH SEE ALSO 37 | .BR wn (1WN), 38 | .BR wnb (1WN), 39 | .BR senseidx (5WN), 40 | .BR wnsearch (3WN), 41 | .BR wndb (5WN), 42 | .BR wnintro (7WN). 43 | 44 | -------------------------------------------------------------------------------- /prolog/wngroups.7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/prolog/wngroups.7.pdf -------------------------------------------------------------------------------- /prolog/wngroups.7WN.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNGROUPS(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wngroups - discussion of WordNet search code to group similar verb 13 | senses 14 |

DESCRIPTION

15 | Some similar senses of verbs have been grouped by 16 | the lexicographers. This grouping is done statically in the lexicographer 17 | source files using the semantic pointer_symbol $ . Transitivity is used 18 | to combine groups of overlapping senses into the largest sense groups 19 | possible. 20 |

NOTES

21 | Coverage of verb groups is incomplete. 22 |

ENVIRONMENT VARIABLES 23 | (UNIX)

24 | 25 |
26 | 27 |
WNHOME
28 |
Base directory for WordNet. Default is /usr/local/WordNet-3.0 29 | .
30 | 31 |
WNSEARCHDIR
32 |
Directory in which the WordNet database has been installed. 33 | Default is WNHOME/dict .
34 |
35 | 36 |

REGISTRY (WINDOWS)

37 | 38 |
39 | 40 |
HKEY_LOCAL_MACHINE\SOFTWARE\WordNet\3.0\WNHome 41 |
42 |
Base directory for WordNet. Default is C:\Program Files\WordNet\3.0 .
43 |
44 | 45 |

FILES 46 |

47 | 48 |
49 | 50 |
sentidx.vrb
51 |
verb sense keys and sentence frame numbers
52 | 53 |
sents.vrb
54 |
example 55 | sentence frames
56 |
57 | 58 |

SEE ALSO

59 | wn(1WN) 60 | , wnb(1WN) 61 | , senseidx(5WN) 62 | , wnsearch(3WN) 63 | , 64 | wndb(5WN) 65 | , wnintro(7WN) 66 | .

67 |

68 | 69 |


70 | Table of Contents

71 |

80 | 81 | -------------------------------------------------------------------------------- /prolog/wninput.5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/prolog/wninput.5.pdf -------------------------------------------------------------------------------- /prolog/wnpkgs.7: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" $Id$ 3 | .tr ~ 4 | .TH WNPKGS 7WN "Dec 2006" "WordNet 3.0" "WordNet\(tm" 5 | .SH NAME 6 | wnpkgs \- description of various WordNet system packages 7 | .SH DESCRIPTION 8 | WordNet 3.0 is distributed in several formats and in various packages. 9 | All of the packages are available via anonymous FTP from 10 | \fBftp.cogsci.princeton.edu\fP and from the WordNet Web 11 | site at \fBhttp://wordnet.princeton.edu\fP. 12 | .SS "Packages Available Via FTP and WWW" 13 | The following WordNet packages can be downloaded using a web browser 14 | from \fBftp://ftp.cogsci.princeton.edu/wordnet/3.0\fP, or 15 | from the Web site noted above. Users can also FTP directly from 16 | \fBftp.cogsci.princeton.edu\fP, directory \fBwordnet/3.0\fP. 17 | 18 | .TS 19 | center box ; 20 | c | c | c | c 21 | lt | l | l | lt. 22 | \fBPackage\fP \fBFilename\fP \fBPlatform\fP \fBDescription\fP 23 | _ 24 | .na 25 | Database \fBWordNet-3.0.tar.gz\fP Unix/OS X T{ 26 | WordNet 3.0 database, interfaces, sense index, interface 27 | and library source code, documentation. 28 | T} 29 | Database \fBWordNet-3.0.exe\fP Windows T{ 30 | WordNet 3.0 database, interfaces, sense index, interface 31 | and library source code, documentation. 32 | T} 33 | Prolog Database \fBWNprolog-3.0.tar.gz\fP All T{ 34 | WordNet 3.0 database files in Prolog-readable format, documentation. 35 | T} 36 | Sense Map \fBWNsnsmap-3.0.tar.gz\fP All T{ 37 | Mapping of 2.1 to 3.0 senses, documentation. 38 | T} 39 | .TE 40 | 41 | .SS "Database Package" 42 | The database package is a complete installation for WordNet 3.0 users. 43 | It includes the 3.0 database files, source code for the WordNet browsers and 44 | library, and documentation. The other packages are not included \- 45 | they must be downloaded and installed separately. 46 | 47 | Note that with this version of WordNet for Unix platforms, only source 48 | code is provided. Users should carefully read the README and INSTALL 49 | files for detailed information on compiling WordNet and dependencies. 50 | 51 | .SS Prolog Database Package 52 | The WordNet 3.0 database files are available in this package in a 53 | Prolog-readable format. Documentation describing the file format is 54 | included. This package is only downloadable in compressed tar file 55 | format, although once unpackaged it can be used from Windows 56 | systems since the files are in ASCII. Many Windows utilities, such as 57 | WinZip, can deal with a 58 | compressed tar file. 59 | .SS Sense Map Package 60 | To help users automatically convert 2.1 noun and verb senses to their 61 | corresponding 3.0 senses, we provide sense mapping information in 62 | this package. This package contains files to map polysemous and 63 | monosemous words, and documentation that describes the format of these 64 | files. As with the Prolog database, this package is only downloadable 65 | in compressed tar format, but the files are also in ASCII. 66 | .SH NOTES 67 | The lexicographer files and 68 | .BR grind (1WN) 69 | program are not generally distributed. 70 | 71 | All of the packages described above may not be available at the time 72 | of release of the 3.0 database package. 73 | .SH SEE ALSO 74 | .BR wnintro (1WN), 75 | .BR wnintro (3WN), 76 | .BR wnintro (5WN), 77 | .BR wnintro (7WN). 78 | -------------------------------------------------------------------------------- /prolog/wnpkgs.7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kraison/cl-nlp/1876934ab7320cdebef1d9861be865cb59067b41/prolog/wnpkgs.7.pdf -------------------------------------------------------------------------------- /prolog/wnpkgs.7WN.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNPKGS(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnpkgs - description of various WordNet system packages 13 |

DESCRIPTION 14 |

15 | WordNet 3.0 is distributed in several formats and in various packages. All 16 | of the packages are available via anonymous FTP from ftp.cogsci.princeton.edu 17 | and from the WordNet Web site at http://wordnet.princeton.edu 18 | . 19 |

Packages 20 | Available Via FTP and WWW

21 | The following WordNet packages can be downloaded 22 | using a web browser from ftp://ftp.cogsci.princeton.edu/wordnet/3.0 , or from 23 | the Web site noted above. Users can also FTP directly from ftp.cogsci.princeton.edu 24 | , directory wordnet/3.0 .

25 | 26 | 28 | 30 | 33 | 34 | 36 | 38 |
Package Filename Platform Description 27 |
Database WordNet-3.0.tar.gz Unix/OS X WordNet 3.0 database, interfaces, 29 | sense index, interface and library source code, documentation.
Database 31 | WordNet-3.0.exe Windows WordNet 3.0 database, interfaces, sense index, 32 | interface and library source code, documentation.
Prolog Database WNprolog-3.0.tar.gz All WordNet 3.0 database files in Prolog-readable format, 35 | documentation.
Sense Map WNsnsmap-3.0.tar.gz All Mapping of 2.1 to 3.0 37 | senses, documentation.
39 |

40 | 41 |

Database Package

42 | The database package is a 43 | complete installation for WordNet 3.0 users. It includes the 3.0 database 44 | files, source code for the WordNet browsers and library, and documentation. 45 | The other packages are not included - they must be downloaded and installed 46 | separately.

47 | Note that with this version of WordNet for Unix platforms, 48 | only source code is provided. Users should carefully read the README and 49 | INSTALL files for detailed information on compiling WordNet and dependencies. 50 |

51 | 52 |

Prolog Database Package

53 | The WordNet 3.0 database files are available 54 | in this package in a Prolog-readable format. Documentation describing the 55 | file format is included. This package is only downloadable in compressed 56 | tar file format, although once unpackaged it can be used from Windows 57 | systems since the files are in ASCII. Many Windows utilities, such as 58 | WinZip, can deal with a compressed tar file. 59 |

Sense Map Package

60 | To help 61 | users automatically convert 2.1 noun and verb senses to their corresponding 62 | 3.0 senses, we provide sense mapping information in this package. This 63 | package contains files to map polysemous and monosemous words, and documentation 64 | that describes the format of these files. As with the Prolog database, 65 | this package is only downloadable in compressed tar format, but the files 66 | are also in ASCII. 67 |

NOTES

68 | The lexicographer files and grind(1WN) 69 | program 70 | are not generally distributed.

71 | All of the packages described above may 72 | not be available at the time of release of the 3.0 database package. 73 |

SEE 74 | ALSO

75 | wnintro(1WN) 76 | , wnintro(3WN) 77 | , wnintro(5WN) 78 | , wnintro(7WN) 79 | .

80 | 81 |


82 | Table of Contents

83 |

95 | 96 | -------------------------------------------------------------------------------- /read-cess-esp.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (def-pos-map :cess 4 | (("fa" ".") 5 | ("fc" ".") 6 | ("fd" ".") 7 | ("fe" ".") 8 | ("fg" ".") 9 | ("fh" ".") 10 | ("fi" ".") 11 | ("fp" ".") 12 | ("fs" ".") 13 | ("fx" ".") 14 | ("fz" ".") 15 | ("number" "NUM") 16 | ("zm" "NUM") 17 | ("zp" "NUM") 18 | ("ao" "ADJ") 19 | ("aq" "ADJ") 20 | ("cc" "CONJ") 21 | ("cs" "CONJ") 22 | ("da" "DET") 23 | ("dd" "DET") 24 | ("de" "DET") 25 | ("di" "DET") 26 | ("dn" "DET") 27 | ("dp" "DET") 28 | ("dt" "DET") 29 | ("nc" "NOUN") 30 | ("np" "NOUN") 31 | ("p0" "PRON") 32 | ("pd" "PRON") 33 | ("pe" "PRON") 34 | ("pi" "PRON") 35 | ("pn" "PRON") 36 | ("pp" "PRON") 37 | ("pr" "PRON") 38 | ("pt" "PRON") 39 | ("px" "PRON") 40 | ("rg" "ADV") 41 | ("rn" "ADV") 42 | ("sn" "ADP") 43 | ("sp" "ADP") 44 | ("va" "VERB") 45 | ("vm" "VERB") 46 | ("vs" "VERB") 47 | ("w" "NOUN") 48 | ("z" "NUM") 49 | ("x" "X") 50 | ("y" "X") 51 | ("i" "X"))) 52 | 53 | (defmethod lookup-generic-pos ((name (eql :cess)) pos &key &allow-other-keys) 54 | (let ((alist (gethash name *pos-maps*))) 55 | (when alist 56 | (cdr (assoc pos alist 57 | :test (lambda (key1 key2) 58 | (let ((regex (format nil "^~A" key2))) 59 | (scan regex key1))) 60 | ))))) 61 | 62 | (defmethod lookup-specific-pos ((name (eql :cess)) pos &key &allow-other-keys) 63 | (let ((alist (gethash name *pos-maps*))) 64 | (when alist 65 | (let ((tag 66 | (car (assoc pos alist 67 | :test (lambda (key1 key2) 68 | (let ((regex (create-scanner 69 | (format nil "^~A" key2) 70 | :case-insensitive-mode t))) 71 | (scan regex key1))))))) 72 | (when tag 73 | (intern (string-upcase tag) :keyword)))))) 74 | 75 | (defun parse-cess-esp-tree (tree) 76 | (let ((leaf-phrases nil)) 77 | (labels ((dfs-helper (this-node) 78 | (cond ((and (consp this-node) 79 | (atom (first this-node)) 80 | (atom (second this-node)) 81 | (atom (third this-node))) 82 | (push 83 | (cons 84 | (string-downcase (format nil "~A" (first this-node))) 85 | (string-downcase (format nil "~A" (second this-node)))) 86 | leaf-phrases)) 87 | ((and (consp this-node) 88 | (atom (first this-node)) 89 | (consp (second this-node))) 90 | (dolist (phrase (rest this-node)) 91 | (dfs-helper phrase)))))) 92 | (dfs-helper tree)) 93 | (delete-if (lambda (pair) 94 | (equal " " (cdr pair))) 95 | (nreverse leaf-phrases)))) 96 | 97 | (defun read-cess-esp (directory) 98 | (let ((sentences nil)) 99 | (dolist (file (cl-fad:list-directory directory)) 100 | (when (cl-ppcre:scan "\.sexp$" (namestring file)) 101 | (handler-case 102 | (map-sexp-corpus 103 | (lambda (tree) 104 | (let ((sentence (parse-cess-esp-tree tree))) 105 | (push (mapcar 106 | (lambda (pair) 107 | (let ((pos (lookup-specific-pos :cess (car pair)))) 108 | (unless pos 109 | (error "UNKNOWN POS IN ~A: ~S" file pair)) 110 | (cons pos (cdr pair)))) 111 | sentence) 112 | sentences))) 113 | file 114 | :collect-p nil 115 | :external-format :LATIN-1) 116 | (error (c) 117 | (log:error "ERROR READING ~A: ~A" file c))))) 118 | sentences)) 119 | -------------------------------------------------------------------------------- /read-paisa.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (def-pos-map :isst 4 | (("A" "ADJ") 5 | ("AP" "PRON") 6 | ("B" "ADV") 7 | ("BN" "ADV") 8 | ("C" "CONJ") 9 | ("CC" "CONJ") 10 | ("CS" "CONJ") 11 | ("DQ" "DET") 12 | ("DD" "DET") 13 | ("DE" "DET") 14 | ("DI" "DET") 15 | ("DR" "DET") 16 | ("DT" "DET") 17 | ("E" "ADP") 18 | ("EA" "ADP") 19 | ("I" "X") 20 | ("N" "NUM") 21 | ("NO" "NUM") 22 | ("PD" "PRON") 23 | ("PI" "PRON") 24 | ("PP" "PRON") 25 | ("PQ" "PRON") 26 | ("PR" "PRON") 27 | ("PT" "PRON") 28 | ("PU" ".") 29 | ("RD" "DET") 30 | ("RI" "DET") 31 | ("S" "NOUN") 32 | ("SA" "X") 33 | ("SP" "NOUN") 34 | ("SW" "NOUN") 35 | ("V" "VERB") 36 | ("X" "X"))) 37 | 38 | (defun convert-paisa-file (in-file out-file) 39 | "Convert a Paisa corpus file into standard word/pos format" 40 | (with-open-file (stream out-file 41 | :direction :output 42 | :if-exists :supersede 43 | :if-does-not-exist :create) 44 | (map-conll-corpus 45 | (lambda (sentence pos-seq) 46 | (dotimes (i (length sentence)) 47 | (format stream "~A/~A" (elt sentence i) (elt pos-seq i)) 48 | (if (= i (- (length sentence) 1)) 49 | (terpri stream) 50 | (format stream " ")))) 51 | in-file))) 52 | 53 | (defun gather-paisa-symbols (file) 54 | "Utility function that lists unique pos tags in a Paisa file" 55 | (let ((symbols nil)) 56 | (map-conll-corpus 57 | (lambda (sentence pos-seq) 58 | (declare (ignore sentence)) 59 | (dotimes (i (length pos-seq)) 60 | (pushnew (elt pos-seq i) symbols :test 'equal))) 61 | file) 62 | symbols)) 63 | -------------------------------------------------------------------------------- /read-tiger.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defun convert-tiger-conll-file (in-file out-file) 4 | (with-open-file (stream out-file 5 | :direction :output 6 | :if-exists :supersede 7 | :if-does-not-exist :create 8 | :element-type 'character 9 | :external-format :utf-8) 10 | (map-conll-corpus (lambda (sentence pos-seq) 11 | (dotimes (i (length sentence)) 12 | (format stream "~A/~A" (elt sentence i) (elt pos-seq i)) 13 | (if (= i (- (length sentence) 1)) 14 | (terpri stream) 15 | (format stream " ")))) 16 | in-file))) 17 | 18 | (defun find-dom-element (name dom) 19 | (find name dom 20 | :key (lambda (e) (and (consp e) (first e))) 21 | :test 'equalp)) 22 | 23 | (defun index-terminals (index terminals) 24 | (delete-if 25 | 'null 26 | (mapcar (lambda (terminal) 27 | (when (and (consp terminal) 28 | (equalp (first terminal) "t")) 29 | (let ((items (second terminal))) 30 | (setf (gethash (second (find-dom-element "id" items)) index) 31 | (list (intern (string-upcase 32 | (second (find-dom-element "pos" items))) 33 | :keyword) 34 | (second (find-dom-element "word" items))))))) 35 | terminals))) 36 | 37 | (defun index-nonterminals (index nonterminals) 38 | (delete-if 39 | 'null 40 | (mapcar 41 | (lambda (node) 42 | (when (and (consp node) (equalp (first node) "nt")) 43 | (let ((category (intern (second (first (second node))) :keyword)) 44 | (id (second (second (second node))))) 45 | (setf (gethash id index) 46 | (list category 47 | (delete-if 48 | 'null 49 | (mapcar 50 | (lambda (edge) 51 | (when (and (consp edge) 52 | (equalp "edge" (first edge))) 53 | (list (intern (second (second (second edge))) :keyword) 54 | (second (first (second edge)))))) 55 | node))))))) 56 | nonterminals))) 57 | 58 | (defun convert-tiger-xml-file (in-file out-file) 59 | (let* ((dom (cxml:parse-file in-file (cxml-xmls:make-xmls-builder))) 60 | (body (find-dom-element "body" dom))) 61 | (with-open-file (stream out-file 62 | :direction :output 63 | :if-exists :supersede 64 | :if-does-not-exist :create 65 | :element-type 'character 66 | :external-format :utf-8) 67 | (dolist (node body) 68 | (when (and (consp node) (equalp (first node) "s")) 69 | (let ((graph (find-dom-element "graph" node)) 70 | (index (make-hash-table :test 'equalp))) 71 | (let ((terminals 72 | (index-terminals index 73 | (find-dom-element "terminals" graph))) 74 | (nonterminals 75 | (index-nonterminals index 76 | (find-dom-element "nonterminals" graph)))) 77 | (let ((used-nodes nil)) 78 | (dolist (nt nonterminals) 79 | (loop for i from 0 below (length (second nt)) do 80 | (let ((id (second (nth i (second nt))))) 81 | (when id 82 | (let ((sub-node (gethash id index))) 83 | (push sub-node used-nodes) 84 | (setf (second (nth i (second nt))) sub-node)))))) 85 | (let ((tree 86 | (set-difference (append terminals nonterminals) 87 | used-nodes 88 | :test 'equalp))) 89 | (when tree 90 | (write tree :stream stream) 91 | (terpri stream))))))))))) 92 | 93 | (defun flatten-tiger-phrase-tree (tree) 94 | (let ((leaf-phrases nil)) 95 | (labels ((dfs-helper (this-node phrase-type) 96 | (cond ((and (consp this-node) 97 | (eql :NP (first this-node)) 98 | (= 2 (length this-node)) 99 | (every (lambda (child) 100 | (and (consp child) 101 | (= 2 (length child)) 102 | (symbolp (first child)) 103 | (consp (second child)) 104 | (every 'atom (second child)))) 105 | (second this-node))) 106 | (dolist (child (second this-node)) 107 | (push (list :NP (first (second child))) leaf-phrases))) 108 | ((and (consp this-node) 109 | (= 2 (length this-node)) 110 | (atom (first this-node)) 111 | (atom (second this-node))) 112 | (push (list phrase-type (first this-node)) leaf-phrases)) 113 | ((and (consp this-node) 114 | (atom (first this-node)) 115 | (every 'consp (rest this-node)) 116 | (every (lambda (c) (= 2 (length c))) (rest this-node))) 117 | (dolist (phrase (rest this-node)) 118 | (dfs-helper phrase (first this-node)))) 119 | ((consp this-node) 120 | (dolist (child this-node) 121 | (dfs-helper child (first this-node))))))) 122 | (dfs-helper tree nil)) 123 | (nreverse leaf-phrases))) 124 | -------------------------------------------------------------------------------- /read-tut.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (defun extract-tut-symbols (file) 4 | "Utility function that lists unique pos tags in a TUT grammar tree file" 5 | (let ((symbols nil)) 6 | (map-sexp-corpus 7 | (lambda (tree) 8 | (map nil 9 | (lambda (pair) 10 | (pushnew (second pair) symbols)) 11 | (flatten-phrase-tree tree))) 12 | file) 13 | (sort symbols 'string-lessp :key 'symbol-name))) 14 | 15 | (defun translate-tut-file (in-file out-file) 16 | "TUT distributes a Penn-style grammar tree, but uses idiomatic tags; this 17 | function translates those into tags in Paisa equivalents" 18 | (with-open-file (out out-file 19 | :direction :output 20 | :if-exists :supersede 21 | :if-does-not-exist :create 22 | :element-type 'character 23 | :external-format :utf-8) 24 | (map-sexp-corpus 25 | (lambda (tree) 26 | (write 27 | (list 28 | (nsublis '(("^ADJ~PO" . :AP) 29 | ("^ADJ" . :A) 30 | ("^ADVB" . :B) 31 | ("^ART-IN" . :RI) 32 | ("^ART" . :RD) 33 | ("^CONJ" . :CC) 34 | ("^DATE" . :N) 35 | ("^NOU-PR" . :SP) 36 | ("^NOU" . :S) 37 | ("^NUMR" . :N) 38 | ("^PHRAS" . :B) 39 | ("^PRDT" . :T) 40 | ("^PREP" . :E) 41 | ("^PRO~DE" . :PD) 42 | ("^PRO~ID" . :PI) 43 | ("^PRO~IN" . :PQ) 44 | ("^PRO~LO" . :PC) 45 | ("^PRO~OR" . :PQ) 46 | ("^PRO~PE" . :PE) 47 | ("^PRO~PO" . :PP) 48 | ("^PRO~RE" . :PR) 49 | ("^PRO~RI" . :PC) 50 | ("^PUNCT" . :FF) 51 | ("^SPECIAL" . :X) 52 | ("^VAU" . :VA) 53 | ("^VMA" . :V) 54 | ("^VMO" . :VM)) 55 | tree 56 | :test (lambda (tag pattern) 57 | (when (atom tag) 58 | (scan pattern (format nil "~A" tag)))))) 59 | :stream out 60 | :readably t) 61 | (terpri out)) 62 | in-file))) 63 | -------------------------------------------------------------------------------- /read-wikicorpus-es.lisp: -------------------------------------------------------------------------------- 1 | (in-package :nlp) 2 | 3 | (def-pos-map :wiki-es 4 | (("number" "NUM") 5 | ("frc" ".") 6 | ("flt" ".") 7 | ("fla" ".") 8 | ("fra" ".") 9 | ("ft" ".") 10 | ("fa" ".") 11 | ("fc" ".") 12 | ("fd" ".") 13 | ("fe" ".") 14 | ("fg" ".") 15 | ("fh" ".") 16 | ("fi" ".") 17 | ("fp" ".") 18 | ("fs" ".") 19 | ("fx" ".") 20 | ("fz" ".") 21 | ("zm" "NUM") 22 | ("zp" "NUM") 23 | ("ao" "ADJ") 24 | ("aq" "ADJ") 25 | ("cc" "CONJ") 26 | ("cs" "CONJ") 27 | ("da" "DET") 28 | ("dd" "DET") 29 | ("de" "DET") 30 | ("di" "DET") 31 | ("dn" "DET") 32 | ("dp" "DET") 33 | ("dt" "DET") 34 | ("nc" "NOUN") 35 | ("np" "NOUN") 36 | ("p0" "PRON") 37 | ("pd" "PRON") 38 | ("pe" "PRON") 39 | ("pi" "PRON") 40 | ("pn" "PRON") 41 | ("pp" "PRON") 42 | ("pr" "PRON") 43 | ("pt" "PRON") 44 | ("px" "PRON") 45 | ("rg" "ADV") 46 | ("rn" "ADV") 47 | ("sn" "ADP") 48 | ("sp" "ADP") 49 | ("va" "VERB") 50 | ("vm" "VERB") 51 | ("vs" "VERB") 52 | ("w" "NOUN") 53 | ("z" "NUM") 54 | ("x" "X") 55 | ("y" "X") 56 | ("i" "X"))) 57 | 58 | (defmethod lookup-generic-pos ((name (eql :wiki-es)) pos &key &allow-other-keys) 59 | (let ((alist (gethash name *pos-maps*))) 60 | (when alist 61 | (cdr (assoc pos alist 62 | :test (lambda (key1 key2) 63 | (let ((regex (cl-ppcre:create-scanner 64 | (format nil "^~A" key2) 65 | :case-insensitive-mode t))) 66 | (scan regex key1)))))))) 67 | 68 | 69 | (defmethod lookup-specific-pos ((name (eql :wiki-es)) pos &key &allow-other-keys) 70 | (let ((alist (gethash name *pos-maps*))) 71 | (when alist 72 | (let ((tag 73 | (car (assoc pos alist 74 | :test (lambda (key1 key2) 75 | (let ((regex (cl-ppcre:create-scanner 76 | (format nil "^~A" key2) 77 | :case-insensitive-mode t))) 78 | (scan regex key1))))))) 79 | (when tag 80 | (intern (string-upcase tag) :keyword)))))) 81 | 82 | (defun read-wiki-es (directory) 83 | (let ((sentences nil)) 84 | (dolist (file (cl-fad:list-directory directory)) 85 | (let ((sentence nil)) 86 | (with-open-file (in file 87 | :element-type 'character 88 | :external-format :latin-1) 89 | (do ((line (read-line in nil :eof) (read-line in nil :eof))) 90 | ((eql line :eof)) 91 | (cond ((scan "^\<\/doc\>" line) 92 | (when sentence 93 | (push (nreverse sentence) sentences) 94 | (setq sentence nil)) 95 | ;;(return-from read-wiki-es sentences) 96 | ) 97 | ((scan "^\" line) 125 | (when sentence 126 | (setq sentence (nreverse sentence)) 127 | (dotimes (i (length sentence)) 128 | (format stream "~A/~A" 129 | (cdr (nth i sentence)) 130 | (car (nth i sentence))) 131 | (unless (= i (1- (length sentence))) 132 | (format stream " "))) 133 | (terpri stream) 134 | (setq sentence nil)) 135 | ;;(return-from read-wiki-es sentences) 136 | ) 137 | ((scan "^\ length 2) 68 | (my-union 69 | (cons 70 | (union (first ll) (second ll) :test test :key key) 71 | (cddr ll)))))))) 72 | (my-union (sort list-of-lists #'> :key #'length)))) 73 | 74 | ;;; Thanks, Mr. Norvig for this queueing code 75 | (defun print-queue (q stream depth) 76 | (declare (ignore depth)) 77 | (format stream "" (queue-elements q))) 78 | 79 | (defstruct (queue 80 | (:print-function print-queue)) 81 | (key #'identity) 82 | (last nil) 83 | (elements nil)) 84 | 85 | (defun make-empty-queue () (make-queue)) 86 | 87 | (defun empty-queue-p (q) 88 | (= (length (queue-elements q)) 0)) 89 | 90 | (defun queue-front (q) 91 | (elt (queue-elements q) 0)) 92 | 93 | (defun dequeue (q) 94 | (when (listp (queue-elements q)) 95 | (pop (queue-elements q)))) 96 | 97 | (defun enqueue (q items) 98 | (cond ((null items) nil) 99 | ((or (null (queue-last q)) (null (queue-elements q))) 100 | (setf (queue-last q) (last items) 101 | (queue-elements q) (nconc (queue-elements q) items))) 102 | (t (setf (cdr (queue-last q)) items 103 | (queue-last q) (last items))))) 104 | 105 | (defun queue-length (q) 106 | (length (queue-elements q))) 107 | 108 | (defun find-anywhere (item tree) 109 | "Does item occur anywhere in tree?" 110 | (if (atom tree) 111 | (if (eql item tree) tree) 112 | (or (find-anywhere item (first tree)) 113 | (find-anywhere item (rest tree))))) 114 | 115 | (defun last1 (list) 116 | "Return the last item in a list" 117 | (car (last list))) 118 | 119 | (defun second-to-last (list) 120 | "Return the second to last item in a list" 121 | (let ((l (length list))) 122 | (when (> l 1) 123 | (car (subseq list (- l 2) (1- l)))))) 124 | 125 | (defun flatten (lis) 126 | "Flatten a tree into a list" 127 | (cond ((atom lis) lis) 128 | ((listp (car lis)) 129 | (append (flatten (car lis)) (flatten (cdr lis)))) 130 | (t (append (list (car lis)) (flatten (cdr lis)))))) 131 | 132 | (defun init-or-increment (hash thing) 133 | "Increment or add a value for this hash key" 134 | (incf (gethash thing hash 0))) 135 | 136 | (defun join (seq &optional (delimiter " ")) 137 | "Join a seq into a space-delimited string" 138 | (with-output-to-string (out) 139 | (dotimes (i (length seq)) 140 | (format out "~A" (elt seq i)) 141 | (unless (= i (1- (length seq))) 142 | (format out "~A" delimiter))))) 143 | 144 | (defun quit (&optional code) 145 | ;; This group from "clocc-port/ext.lisp" 146 | #+allegro (excl:exit code) 147 | #+clisp (#+lisp=cl ext:quit #-lisp=cl lisp:quit code) 148 | #+cmu (ext:quit code) 149 | #+cormanlisp (win32:exitprocess code) 150 | #+gcl (lisp:bye code) ; XXX Or is it LISP::QUIT? 151 | #+lispworks (lw:quit :status code) 152 | #+lucid (lcl:quit code) 153 | #+sbcl (sb-ext:exit :code code) 154 | ;; This group from Maxima 155 | #+kcl (lisp::bye) ; XXX Does this take an arg? 156 | #+scl (ext:quit code) ; XXX Pretty sure this *does*. 157 | #+(or openmcl mcl) (ccl::quit) 158 | #+abcl (cl-user::quit) 159 | #+ecl (si:quit) 160 | ;; This group from 161 | #+poplog (poplog::bye) ; XXX Does this take an arg? 162 | #-(or allegro clisp cmu cormanlisp gcl lispworks lucid sbcl 163 | kcl scl openmcl mcl abcl ecl) 164 | (error 'not-implemented :proc (list 'quit code))) 165 | -------------------------------------------------------------------------------- /wiki-es.lisp: -------------------------------------------------------------------------------- 1 | (ql:quickload :nlp) 2 | 3 | (in-package :nlp) 4 | 5 | (flatten-wiki-es) 6 | --------------------------------------------------------------------------------