├── LICENSE ├── README.md ├── TODO ├── config.sexp ├── data ├── BIGRAMS.txt ├── CONTEXTUALRULEFILE-BROWN.txt ├── CONTEXTUALRULEFILE-WSJ-NOLEX.txt ├── CONTEXTUALRULEFILE-WSJ.txt ├── LEXICALRULEFILE-BROWN.txt ├── LEXICALRULEFILE-WSJ.txt ├── LEXICON-BROWN-AND-WSJ.txt ├── NBEST-RULES.txt ├── TESTRULEFILE-IAN.txt ├── concise-stopwords.txt ├── morph-truncated-fixed.txt ├── stem-dict.txt ├── stopwords.txt └── suffix-prob.txt ├── docs └── LISP2005-langutils.pdf ├── langutils.asd └── src ├── chunker-constants.lisp ├── chunker.lisp ├── concept.lisp ├── config.lisp ├── contextual-rule-parser.lisp ├── example.lisp ├── init.lisp ├── lemma.lisp ├── lexicon.lisp ├── my-meta.lisp ├── package.lisp ├── porter.lisp ├── reference.lisp ├── regex-tokenize.lisp ├── stopwords.lisp ├── tagger-data.lisp ├── tagger-oldctx.lisp ├── tagger.lisp ├── tokenize.lisp ├── tokens.lisp ├── vector-keyed-table.lisp └── vectors.lisp /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011, Ian Eslick 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials 15 | provided with the distribution. 16 | 17 | Neither Ian Eslick nor the names of any contributors may be 18 | used to endorse or promote products derived from this software 19 | without specific prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | LANGUTILS LIBRARY 2 | ================= 3 | 4 | This file contains a simple guide to the main functions and files of 5 | the langutils library. The code is reasonably documented with doc 6 | strings and inline comments. Write to the author if there are any 7 | questions. Also read [docs/LISP2005-langutils.pdf](http://github.com/eslick/langutils/blob/master/docs/LISP2005-langutils.pdf?raw=true) 8 | which is a more involved exposition of the implementation and 9 | performance issues in the toolkit. 10 | 11 | The library provides a hierarchy of major functions and auxiliary 12 | functions related to the structured analysis and processing of 13 | open text. The major functions working from raw text up are: 14 | 15 | - String tokenization (string -> string) 16 | - Part of speech tagging (string -> tokens -> vector-document) 17 | - Phrase chunking (vector-document -> phrases) 18 | 19 | We also provide auxiliary functions that operate on strings, tokens or 20 | vector-documents.
The Lisp functions implementing the functionality 21 | can be found under the appropriately labeled section in the reference 22 | below. 23 | 24 | ## Strings 25 | 26 | - Tokenize a string (separate punctuation from word tokens) 27 | - POS tag a string or file returning a file, string or vector-document 28 | - Identify suspicious strings that may become tokens 29 | 30 | ## Tokens 31 | 32 | - String to token-id conversion routines 33 | - Save/Load token maps 34 | - Guess the POS tag for a token (lexicon-based, also includes the Porter stemmer) 35 | - Identify suspicious tokens 36 | - Identify stopwords; words used primarily as syntactic combinators 37 | - Look up words in the lexicon 38 | - Get possible parts of speech for known words 39 | - Lemmatize a token (find the root lemma for a given surface form) 40 | - Generate all surface forms of a root word 41 | 42 | ## Vector-Documents: 43 | 44 | - Generate phrases using the regex chunker 45 | 46 | ## Miscellaneous: 47 | 48 | - Concept representation: simple lemmatized noun or verb phrases can 49 | be treated as equal abstract notions; provides a CLOS class wrapper. 50 | 51 | 52 | 53 | INTERFACE REFERENCE 54 | =================== 55 | 56 | This documents the important functions of the langutils toolkit. 57 | Documentation entries are of the form: 58 | 59 | ---------------------------------------------------------------------- 60 | function( args ) 61 | ---------------------------------------------------------------------- 62 | Input: 63 | arg1 - description 64 | arg2 - description 65 | 66 | Output: 67 | description 68 | 69 | Notes: 70 | discussion of use cases, etc. 71 | 72 | Functions are explicitly referenced by putting () around them; variables or 73 | parameters are written between asterisks, e.g. *default-token-map-file-int*.
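Before the individual entries, here is a minimal end-to-end sketch of the three major layers. This is illustrative only; it assumes the langutils system has been loaded and initialized, and the sample sentence and variable names are invented for the example:

```lisp
;; Sketch: string -> tokenized string -> tagged vector-document -> phrases.
;; Assumes langutils is loaded and its data files have been initialized.
(let* ((text "The dog chased the cat.")
       ;; string -> string: separate punctuation from word tokens
       (tokenized (tokenize-string text))
       ;; string -> vector-document: POS tag the text
       (doc (vector-tag tokenized))
       ;; vector-document -> phrases: run the regex chunker
       (phrases (get-all-chunks doc)))
  (values tokenized doc phrases))
```

Each of the functions used here is documented in the sections below.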
74 | 75 | 76 | TOKENS AND TOKENIZATION 77 | ======================= 78 | ---------------------------------------------------------------------- 79 | tokenize-stream (stream &key (by-sentence nil) (fragment "")) 80 | ---------------------------------------------------------------------- 81 | Input: 82 | stream - A standard Lisp stream containing the characters to analyze; 83 | the stream can be of any length 84 | by-sentence - Stop the tokenization process after each processed sentence, 85 | meaning each validly parsed period, exclamation point or question mark. 86 | fragment - Provide a fragment from a prior call to tokenize-stream at the 87 | beginning of the parse stream. 88 | 89 | Output: (multiple-values) 90 | 1 - parsing success (t) or failure (nil) 91 | 2 - the current index into the stream, starts from 0 on every call 92 | 3 - a string containing the tokenized data parsed up to 'index' 93 | 4 - if parsing was a success, provides a fragment of any unparsed 94 | data (primarily in by-sentence mode) 95 | 96 | Notes: 97 | This function is intended to be called all at once or in batches. 98 | For large strings or files it should be called in by-sentence mode 99 | in a loop that captures any fragments and passes them to the next call. 100 | The function operates by grabbing one character at a time from the stream 101 | and writing it into a temporary array. When it reaches a punctuation 102 | character, it inserts a whitespace, then backs up to the beginning of the current 103 | token and checks whether the token should have included the punctuation 104 | and fixes up the temporary array.
Upon completion of the current parse (end 105 | of stream or end of sentence) it returns the values described under Output above. 106 | 107 | 108 | ---------------------------------------------------------------------- 109 | tokenize-string (string) 110 | ---------------------------------------------------------------------- 111 | Input: 112 | 113 | - string - a string of English natural language text 114 | 115 | Output: (string) 116 | 117 | Returns a string which is the result of calling (tokenize-stream) on 118 | the stream version of the input string. 119 | 120 | 121 | ---------------------------------------------------------------------- 122 | tokenize-file (source target &key (if-exists :supersede)) 123 | ---------------------------------------------------------------------- 124 | Input: 125 | 126 | - source - The source file name as a string or pathname 127 | - target - The target file name as a string or pathname 128 | Output: 129 | The tokenized contents of source are written to target. 130 | ---------------------------------------------------------------------- 131 | id-for-token ( token ) 132 | ---------------------------------------------------------------------- 133 | Input: 134 | 135 | - token - A string representing a primitive token 136 | 137 | Output: 138 | A fixnum providing a unique id for the provided string token. 139 | 140 | Notes: 141 | Tokens are case sensitive, so 'The', 'the' and 'THE' all 142 | map to different tokens but should map to the same entry in the 143 | lexicon. The root form of a lexicon word is the lower case 144 | representation. 145 | 146 | 147 | ---------------------------------------------------------------------- 148 | token-for-id ( id ) 149 | ---------------------------------------------------------------------- 150 | Input: 151 | 152 | - id - A fixnum id 153 | 154 | Output: 155 | The original token string.
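As a sketch of the id round trip described above (the token is illustrative; the actual fixnum values depend on the session's token map):

```lisp
;; id-for-token and token-for-id are inverses for a given token map.
(let ((id (id-for-token "dog")))  ; some fixnum, session-dependent
  (token-for-id id))              ; recovers the string "dog"

;; Tokens are case sensitive, so this yields a *different* id:
(id-for-token "Dog")
```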
156 | 157 | 158 | ---------------------------------------------------------------------- 159 | tokens-for-id ( ids ) 160 | ---------------------------------------------------------------------- 161 | Input: 162 | 163 | - ids - A list of fixnum ids 164 | 165 | Output: 166 | A list of string representations of each id 167 | 168 | 169 | ---------------------------------------------------------------------- 170 | save-token-map ( filename ) 171 | ---------------------------------------------------------------------- 172 | Input: 173 | 174 | - filename - A path or string to save token information to 175 | 176 | Output: 177 | t on success or nil otherwise 178 | 179 | Notes: 180 | This procedure will default to the filename in *default-token-map-file-int* 181 | which can be set via the asdf-config parameter 'token-map' 182 | 183 | 184 | ---------------------------------------------------------------------- 185 | load-token-map ( filename ) 186 | ---------------------------------------------------------------------- 187 | Input: 188 | 189 | - filename - A path or string to load token information from 190 | 191 | Output: 192 | t on success or nil otherwise 193 | 194 | Notes: 195 | This procedure will default to the filename in *default-token-map-file-int* 196 | which can be set via the asdf-config parameter 'token-map' 197 | 198 | 199 | ---------------------------------------------------------------------- 200 | suspicious-word? ( word ) 201 | ---------------------------------------------------------------------- 202 | Input: 203 | 204 | - word - A fixnum id for a word to test 205 | 206 | Output: 207 | A boolean representing whether this word has been labelled as fishy 208 | 209 | 210 | ---------------------------------------------------------------------- 211 | suspicious-string?
( string ) 212 | ---------------------------------------------------------------------- 213 | Input: 214 | 215 | - string - Any string 216 | 217 | Output: 218 | A boolean representing whether the word is fishy as determined by 219 | parameters set in tokens.lisp (max numbers, total length and other 220 | characters in the token). This is used inside id-for-token to 221 | keep the hash for suspicious-word? up to date. 222 | 223 | 224 | POS TAGGING AND OPERATIONS ON TOKENS 225 | ==================================== 226 | 227 | ---------------------------------------------------------------------- 228 | tag ( string ) 229 | ---------------------------------------------------------------------- 230 | Input: 231 | 232 | - string - An input string to tag. Input should be less than 100k 233 | characters if possible. 234 | 235 | Output: 236 | A tagged string using the format token/TAG, where the tags are symbols 237 | taken from the Penn Treebank 2 tagset. Actual slash characters in the 238 | input show up as '///', i.e. a slash token, slash-separated from its tag. 239 | 240 | Note: 241 | This procedure calls the tokenizer to ensure that the input string is 242 | properly tokenized in advance. 243 | 244 | ---------------------------------------------------------------------- 245 | tag-tokenized ( string ) 246 | ---------------------------------------------------------------------- 247 | Input: 248 | 249 | - string - An input string to tag. The string is assumed to be tokenized 250 | already and should be less than 100k bytes in size 251 | 252 | Output: 253 | A tagged string as in 'tag' above.
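For instance, a call like the following produces slash-annotated output (the sentence is invented and the exact tags depend on the lexicon and rule files in use):

```lisp
;; tag tokenizes internally, so raw text is fine here.
(tag "The dog chased the cat.")
;; Returns a string of token/TAG pairs, along the lines of
;; "The/DT dog/NN chased/VBD the/DT cat/NN ./."
```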
254 | 255 | ---------------------------------------------------------------------- 256 | vector-tag ( string ) 257 | ---------------------------------------------------------------------- 258 | Input: 259 | 260 | - string - as in tag above 261 | 262 | Output: 263 | A CLOS object of type vector-document with the token array initialized 264 | to fixnum representations of the word tokens and the tag array initialized 265 | with symbols representing the selected tags. 266 | 267 | 268 | ---------------------------------------------------------------------- 269 | vector-tag-tokenized ( string &key end-tokens ) 270 | ---------------------------------------------------------------------- 271 | Input: 272 | 273 | - string - as in tag-tokenized above 274 | - end-tokens - A list of string tokens to add to the end of the tokenization 275 | array. Sometimes this is useful to ensure a closing period if you are 276 | doing tagging of structured NL data 277 | 278 | Output: 279 | A vector-document as in vector-tag 280 | 281 | Note: 282 | As with tag-tokenized, this interface does not tokenize the input string. 283 | 284 | ---------------------------------------------------------------------- 285 | get-lexicon-entry ( word ) 286 | ---------------------------------------------------------------------- 287 | Input: 288 | 289 | - word - Token id or token string 290 | 291 | Output: 292 | A lexicon-entry structure related to the lexical characteristics of the token 293 | 294 | Notes: 295 | The lexicon-entry can be manipulated with a set of accessor 296 | functions: lexicon-entry-tag, lexicon-entry-tags, lexical-entry-id, 297 | lexical-entry-roots, lexical-entry-surface-forms, lexical-entry-case-forms, 298 | get-lexicon-default-pos. These functions are not all exported from the library 299 | package, however.
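A sketch of inspecting the lexicon's view of an ambiguous word using the accessors named above (the word is illustrative, and the exact tag list depends on the loaded lexicon files):

```lisp
;; Look up a word that can be either a verb or a noun and ask the
;; lexicon which POS tags it admits.
(let ((entry (get-lexicon-entry "run")))  ; token string or token id
  (lexicon-entry-tags entry))             ; candidate tags, e.g. (:VB :NN ...)
```

Note that some of these accessors may need to be reached via the langutils package prefix since not all are exported.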
300 | 301 | 302 | ---------------------------------------------------------------------- 303 | initial-tag ( token ) 304 | ---------------------------------------------------------------------- 305 | Input: 306 | 307 | - token - A string token 308 | 309 | Output: 310 | A keyword symbol of the initially guessed tag (:PP :NN, etc) 311 | 312 | Notes: 313 | Provides an initial guess based purely on lexical features and lexicon 314 | information of the provided string token. 315 | 316 | 317 | ---------------------------------------------------------------------- 318 | read-file-as-tagged-document ( file ) 319 | ---------------------------------------------------------------------- 320 | Input: 321 | 322 | - file - A string filename or path object 323 | 324 | Output: 325 | A vector-document representing the tagged contents of file 326 | 327 | Notes: 328 | Loads the file into a string then calls vector-tag 329 | 330 | ---------------------------------------------------------------------- 331 | read-and-tag-file ( file ) 332 | ---------------------------------------------------------------------- 333 | Input: 334 | 335 | - file - A path string or a path object 336 | 337 | Output: 338 | A string with tag annotations of the contents of file 339 | 340 | Notes: 341 | Uses tag on the string contents of file 342 | 343 | 344 | ---------------------------------------------------------------------- 345 | get-lemma ( word &key pos (noun t) porter ) 346 | ---------------------------------------------------------------------- 347 | Input: 348 | 349 | - word - String of the word to find the lemma for 350 | - pos - The part of speech of the lemma to return (nil otherwise) 351 | - noun - Whether to stem nouns to the singular form 352 | - porter - Whether to use the porter algorithm if a word is unknown 353 | 354 | Output: 355 | A string representing the lemma of the word, if found 356 | 357 | 358 | ---------------------------------------------------------------------- 359 | 
get-lemma-for-id ( id &key pos (noun t) porter ) 360 | ---------------------------------------------------------------------- 361 | Input: 362 | 363 | - id - The token id to find the lemma of 364 | - pos - As above 365 | - noun - "" 366 | - porter - "" 367 | 368 | Output: 369 | The lemma id 370 | 371 | ---------------------------------------------------------------------- 372 | lemmatize ((sequence list/array) &key strip-det pos (noun t) porter last-only ) 373 | ---------------------------------------------------------------------- 374 | Input: 375 | 376 | - list/array - The input sequence of token ids as a list or an array 377 | - strip-det - Remove determiners from the sequence 378 | - pos - Part of speech of root of terms 379 | - noun - Whether to stem nouns 380 | - porter - Whether to use the Porter stemmer 381 | - last-only - lemmatize the last token in the sequence only 382 | 383 | Output: 384 | Return the lemmatized list of tokens 385 | 386 | Notes: 387 | The main method for performing lemmatization. Valid on lists and arrays of 388 | fixnum values only. Useful for getting the lemmatization of short phrases. 389 | 390 | 391 | ---------------------------------------------------------------------- 392 | morph-surface-forms ( root &optional pos-class ) 393 | ---------------------------------------------------------------------- 394 | Input: 395 | 396 | - root - The root form to expand 397 | - pos-class - if provided (V - verb, N - noun, A - Adverb) the class of 398 | surface forms to generate 399 | 400 | Output: 401 | A list of surface ids 402 | 403 | 404 | ---------------------------------------------------------------------- 405 | morph-surface-forms-text ( root &optional pos-class ) 406 | ---------------------------------------------------------------------- 407 | 408 | String to string form of the above function 409 | 410 | 411 | ---------------------------------------------------------------------- 412 | stopword?
( id ) 413 | ---------------------------------------------------------------------- 414 | Input: 415 | 416 | - id - Input token id 417 | 418 | Output: 419 | boolean 420 | 421 | ---------------------------------------------------------------------- 422 | concise-stopword? ( id ) 423 | ---------------------------------------------------------------------- 424 | Input: 425 | 426 | - id - Input token id 427 | 428 | Output: 429 | boolean 430 | 431 | ---------------------------------------------------------------------- 432 | contains-is? ( ids ) 433 | ---------------------------------------------------------------------- 434 | Input: 435 | 436 | - ids - a list of fixnum token ids 437 | 438 | Output: 439 | boolean 440 | 441 | Notes: 442 | A sometimes useful utility. Searches the list for the token for 'is' 443 | 444 | 445 | ---------------------------------------------------------------------- 446 | string-stopword?, string-concise-stopword?, string-contains-is? ( string ) 447 | ---------------------------------------------------------------------- 448 | 449 | The three above functions but accepting string or list of string arguments 450 | 451 | 452 | CHUNKING 453 | =========== 454 | 455 | ---------------------------------------------------------------------- 456 | chunk ( text ) 457 | ---------------------------------------------------------------------- 458 | Input: 459 | 460 | - Text - raw string text 461 | 462 | Output: 463 | A list of phrases referencing a document created from the text 464 | 465 | Note: 466 | Runs the tokenizer on the text prior to POS tagging 467 | 468 | ---------------------------------------------------------------------- 469 | chunk-tokenized ( text ) 470 | ---------------------------------------------------------------------- 471 | Input: 472 | 473 | - text - raw string text 474 | 475 | Output: 476 | A list of phrases referencing a document created from the text 477 | 478 | Note: 479 | Does not run the tokenizer on text prior to POS tagging 
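A sketch of the two chunking entry points (the sentence is invented; the returned phrases reference an internally created vector-document):

```lisp
;; Raw text: chunk tokenizes and tags internally.
(chunk "The quick brown fox jumped over the lazy dog.")
;; => a list of phrase objects over the tagged document

;; Pre-tokenized text: skip the tokenization pass.
(chunk-tokenized "The quick brown fox jumped over the lazy dog .")
```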
480 | 481 | ---------------------------------------------------------------------- 482 | get-all-chunks ( doc ) 483 | ---------------------------------------------------------------------- 484 | Input: 485 | 486 | - doc - a vector-document 487 | 488 | Output: 489 | A list of chunks of all the primitive types (verb, adverb, preps and nouns) 490 | 491 | Related functions: 492 | 493 | - get-nx-chunks ( doc ) 494 | - get-vx-chunks ( doc ) 495 | - get-ax-chunks ( doc ) 496 | - get-pp-chunks ( doc ) 497 | - get-event-chunks ( doc ) 498 | - get-verb-arg-chunks ( doc ) 499 | 500 | Notes: 501 | 502 | - Events are concatenated verb-noun chunks 503 | - verb-arg chunks look for verb-pp-noun chunk groups 504 | 505 | These two functions could search over sequences of phrases, but 506 | usually those are done alone and not on top of a more primitive 507 | verb, noun, adverb decomposition. Also note that common preposition 508 | idioms (by way of, in front of, etc) are not typically captured 509 | properly and would need to be special cased (ie would be VP-sNP-P-NP 510 | where sNP is a special type of NP instead of the usual VP-P-NP 511 | verb-arg formulation) 512 | 513 | 514 | CONCEPTS 515 | ======== 516 | 517 | Concepts are a CLOS abstraction over token sequences that establishes 518 | identity over lemmatized phrases. This supports special applications 519 | (ConceptNet, LifeNet) at the MIT Media Lab but might be more generally 520 | useful. 
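A sketch of the concept abstraction, using the constructors and equality test documented below (the words are illustrative; the point is that lemmatization makes morphological variants compare equal):

```lisp
;; Build two concepts from singular and plural surface forms.
(let ((c1 (words->concept (list (id-for-token "dog"))))
      (c2 (words->concept (list (id-for-token "dogs")))))
  ;; Identity is established over the lemmatized phrase, so these
  ;; should compare equal even though the token ids differ.
  (conceptually-equal c1 c2))
```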
521 | 522 | ---------------------------------------------------------------------- 523 | concept 524 | ---------------------------------------------------------------------- 525 | The 'concept' is a CLOS object with the following methods 526 | 527 | - concept->words - Return a list of token strings 528 | - concept->string - Return a string representing the concept 529 | - concept->token-array - Return an array representing the concept 530 | - phrase->concept - Create a concept from a phrase 531 | - words->concept - Create a concept from a list of token ids 532 | - token-array->concept - "" 533 | - associate-concepts - Take a list of phrases, lists or token-arrays and find the concepts 534 | they represent. Returns a list of pairs of the form (phrase concept) 535 | - conceptually-equal - equal under lemmatization; also works with phrases and arrays of tokens 536 | - concept-contains - subset relations 537 | 538 | 539 | ---------------------------------------------------------------------- 540 | lookup-canonical-concept-instance ( ta ) 541 | ---------------------------------------------------------------------- 542 | Input: 543 | 544 | - ta - A token array or list of tokens 545 | 546 | Output: 547 | A concept instance 548 | 549 | 550 | EXAMPLE USES 551 | ============ 552 | 553 | See the file example.lisp. This shows basic use of the tagger, 554 | tokenizer, lemmatizer and chunker interfaces. 555 | 556 | More examples of use can be generated if enough mail is sent to the 557 | author to invoke a guilt-driven re-release of the library with 558 | improved documentation. 559 | 560 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | To clean up: 2 | - Separate out utils for distribution 3 | - Performance comparisons 4 | - vs. Brill C code and Allegro 5 | - vs.
SBCL and CLISP 6 | 7 | Easy enhancements: 8 | - Anaphor resolution i/f in linear text 9 | - Lexical bigram file support for tagger 10 | - Direct support for wordlist for tagger 11 | 12 | Potential new work: 13 | - Alternative chunking 14 | Label objects, actions and events from lexicon & commonsense 15 | - Commonsense constraint based parser 16 | - Could learn ala-brill 17 | Either a surprising verb pattern means an object can do x or y is the effect of x 18 | or we have a new syntactic rule or word role? 19 | - Link grammar parser 20 | 21 | -------------------------------------------------------------------------------- /config.sexp: -------------------------------------------------------------------------------- 1 | ((:lexicon :relative "data/LEXICON-BROWN-AND-WSJ.txt") 2 | (:stems :relative "data/stem-dict.txt") 3 | (:stopwords :relative "data/stopwords.txt") 4 | (:concise-stopwords :relative "data/concise-stopwords.txt") 5 | (:lexical-rules :relative "data/LEXICALRULEFILE-BROWN.txt") 6 | (:contextual-rules :relative "data/CONTEXTUALRULEFILE-BROWN.txt") 7 | (:token-map :relative nil) 8 | (:external-token-map :relative nil) 9 | (:auto-init nil) 10 | (:report-status nil)) 11 | -------------------------------------------------------------------------------- /data/BIGRAMS.txt: -------------------------------------------------------------------------------- 1 | NOOTHING NOOTHING 2 | -------------------------------------------------------------------------------- /data/CONTEXTUALRULEFILE-BROWN.txt: -------------------------------------------------------------------------------- 1 | NN VB PREVTAG TO 2 | VB VBP PREVTAG PRP 3 | VBD VBN PREV1OR2TAG VBD 4 | VBN VBD PREVTAG PRP 5 | NN VB PREV1OR2TAG MD 6 | VB VBP PREVTAG NNS 7 | VB NN PREV1OR2TAG DT 8 | VBN VBD PREVTAG NNP 9 | VBD VBN PREV1OR2OR3TAG VBZ 10 | IN DT PREVTAG IN 11 | VBP VB PREV1OR2OR3TAG MD 12 | IN RB WDAND2AFT as as 13 | VBD VBN PREV1OR2TAG VB 14 | RB JJ NEXTTAG NN 15 | VBP VB PREV1OR2OR3TAG TO 16 | POS VBZ 
PREVTAG PRP 17 | NN VBP PREVTAG PRP 18 | DT PDT NEXTTAG DT 19 | IN WDT NEXTTAG VBD 20 | JJ NN SURROUNDTAG DT IN 21 | VBD VBN PREV1OR2TAG VBP 22 | NNS VBZ PREVTAG PRP 23 | IN WDT NEXTTAG VBZ 24 | PRP$ PRP WDNEXTTAG her IN 25 | IN DT NEXTTAG NN 26 | VB VBP PREVTAG WP 27 | IN WDT NEXT1OR2TAG VB 28 | VB VBP PREVTAG WDT 29 | '' POS PREVTAG NNS 30 | VBP VB PREV1OR2OR3TAG VBD 31 | RBR JJR NEXTTAG NN 32 | VBG NN PREVTAG JJ 33 | VB VBN PREVTAG VBD 34 | PRP$ PRP WDNEXTTAG her . 35 | EX RB NEXT1OR2TAG . 36 | NNP JJ SURROUNDTAG STAART NNS 37 | NNP JJ SURROUNDTAG STAART NN 38 | VB NN PREVTAG JJ 39 | VBN VBD PREVTAG WP 40 | NNS VBZ NEXTTAG DT 41 | VBZ NNS PREVTAG JJ 42 | VBN VBD CURWD had 43 | VBD VBN NEXTWD by 44 | VBP VB PREV1OR2TAG VBP 45 | VB NN PREVTAG IN 46 | POS VBZ NEXTTAG DT 47 | VBN VBD CURWD said 48 | VBP VB PREV2TAG VB 49 | RB IN WDNEXTTAG so PRP 50 | JJ VB PREVTAG TO 51 | NNP NN SURROUNDTAG STAART IN 52 | NN VBG NEXTTAG DT 53 | VBN VBD SURROUNDTAG NN DT 54 | RB JJ NEXTTAG NNS 55 | RBR JJR NEXTTAG NNS 56 | '' POS PREV1OR2TAG NNP 57 | VBN VBD PREVWD and 58 | VBD VBN PREVBIGRAM VBN CC 59 | JJ NN SURROUNDTAG JJ IN 60 | JJ NNP NEXTTAG NNP 61 | PRP$ PRP RBIGRAM her , 62 | VBN VBD PREVBIGRAM PRP RB 63 | EX RB NEXTTAG , 64 | VBN VBD SURROUNDTAG , DT 65 | NN VB PREVTAG RB 66 | NN JJ SURROUNDTAG DT NN 67 | VBN VBD PREVTAG WDT 68 | NN NNP NEXTTAG NNP 69 | VB NN PREVTAG PRP$ 70 | NN VBP PREVTAG NNS 71 | PRP$ PRP NEXTTAG TO 72 | VB VBP PREVBIGRAM NNS RB 73 | VBG NN SURROUNDTAG DT IN 74 | PRP$ PRP WDNEXTTAG her RB 75 | VBN VBD PREVWD that 76 | VB VBP PREVBIGRAM PRP RB 77 | VB VBN PREVTAG VBZ 78 | VB VBP WDPREVTAG NN have 79 | VB NN PREVTAG NN 80 | RBR JJR NEXTTAG IN 81 | VBN VBD CURWD got 82 | RBS JJS PREVTAG IN 83 | IN VB PREVTAG MD 84 | CD LS SURROUNDTAG STAART . 85 | VBN VBD SURROUNDTAG NN PRP 86 | EX RB NEXTTAG IN 87 | JJ NN SURROUNDTAG DT . 
88 | DT RB NEXTWD right 89 | POS VBZ PREVTAG DT 90 | VB VBN PREV1OR2WD have 91 | JJ VB PREVTAG MD 92 | CD LS SURROUNDTAG -LRB- -RRB- 93 | IN DT NEXT1OR2OR3TAG . 94 | CD NN NEXTTAG MD 95 | JJ RB NEXTTAG VBN 96 | JJ RB WDNEXTTAG long IN 97 | NN VB PREVBIGRAM VB CC 98 | VBD NN PREVTAG DT 99 | VBN VBD SURROUNDTAG NNS DT 100 | DT UH NEXTTAG , 101 | NN VBG PREVTAG VBD 102 | CD NN NEXTTAG VBZ 103 | JJ NN NEXTTAG VBZ 104 | JJ NN SURROUNDTAG JJ . 105 | VBN VBD PREVBIGRAM NNP RB 106 | IN WDT NEXTTAG VBD 107 | IN RB WDNEXTTAG as RB 108 | VB VBP LBIGRAM , have 109 | NNS VBZ PREVTAG RB 110 | PRP$ PRP WDNEXTTAG her DT 111 | JJ RB WDNEXTTAG right RB 112 | IN NN WDPREVTAG DT while 113 | VB VBP PREVTAG NNPS 114 | NN VBP PREVWD who 115 | NNP NN SURROUNDTAG STAART VBZ 116 | IN RB SURROUNDTAG VBD CC 117 | DT RB NEXTWD longer 118 | RB NN LBIGRAM the back 119 | IN WDT NEXTTAG VBP 120 | NNP NN SURROUNDTAG STAART NNS 121 | IN RB WDAND2AFT As as 122 | IN DT NEXTWD 's 123 | WDT DT PREVTAG CC 124 | JJ JJR CURWD further 125 | JJ NN RBIGRAM right to 126 | VBN VBD SURROUNDTAG NN PRP$ 127 | POS VBZ PREVTAG DT 128 | NNS NN WDPREVTAG DT means 129 | RBS JJS NEXTWD of 130 | IN JJ PREVWD the 131 | VBP VBD PREV1OR2WD he 132 | VBD VBN PREVTAG IN 133 | JJ RB WDNEXTTAG little JJR 134 | VBZ NNS PREVWD the 135 | VB VBN PREV1OR2WD be 136 | POS PRP PREVTAG VB 137 | '' NN RBIGRAM '' '' 138 | NN JJ PREVTAG RB 139 | IN RB SURROUNDTAG VBD . 
140 | JJ RB CURWD then 141 | VB VBP WDPREVTAG NNP have 142 | DT IN NEXTTAG PRP 143 | WP WDT NEXTTAG NN 144 | VB NN PREVBIGRAM MD VB 145 | NNP VB SURROUNDTAG STAART DT 146 | POS VBZ PREVTAG WP 147 | POS VBZ PREVTAG EX 148 | VBN VBD SURROUNDTAG NN NN 149 | NN JJ PREVWD is 150 | VBD VBN LBIGRAM had had 151 | NN VB SURROUNDTAG CC DT 152 | CD NN NEXTTAG WP 153 | DT IN WDAND2TAGAFT that NNS 154 | WDT DT NEXTBIGRAM VBZ , 155 | VBD NN PREVTAG JJ 156 | JJ NN NEXTTAG VBD 157 | VBN VB PREVTAG TO 158 | JJ NN SURROUNDTAG IN IN 159 | NN JJ CURWD first 160 | JJ NN SURROUNDTAG JJ , 161 | NN JJ CURWD few 162 | NNP NN SURROUNDTAG STAART NN 163 | NNP JJ SURROUNDTAG STAART JJ 164 | NNP JJ CURWD Federal 165 | JJ RB WDNEXTTAG early IN 166 | VB VBP PREVTAG DT 167 | VBP VB PREV1OR2TAG VBZ 168 | EX RB NEXT1OR2TAG PRP 169 | VBN NN RBIGRAM set of 170 | VBG NNP PREVTAG NNP 171 | VBD VBN PREVBIGRAM IN DT 172 | VBG NN NEXTWD room 173 | NNS VBZ PREVTAG WDT 174 | VB NN NEXTWD of 175 | RB IN WDNEXTTAG so DT 176 | JJ VBN NEXTWD by 177 | NN JJ WDNEXTTAG future NNS 178 | JJ RB WDNEXTTAG right IN 179 | NN VB PREVTAG MD 180 | JJ NNP NEXTTAG NNP 181 | VBZ NNS PREVTAG PRP$ 182 | CD NN RBIGRAM one 's 183 | CD NN WDNEXTTAG one VBD 184 | RB NN WDPREVTAG PRP$ back 185 | DT RB NEXT1OR2WD than 186 | NN FW PREV1OR2OR3TAG FW 187 | VBD JJ PREVTAG PRP$ 188 | VBN VBD SURROUNDTAG NNS PRP 189 | NN JJ PREV1OR2WD are 190 | NN JJ WDAND2TAGAFT past NNS 191 | JJ RB NEXTTAG VBD 192 | RB IN WDNEXTTAG So PRP 193 | NN JJ PREVWD be 194 | NN VB CURWD go 195 | VBG NN SURROUNDTAG DT , 196 | VBP VB PREV1OR2WD n't 197 | NNS VBZ PREVWD that 198 | RB JJ WDPREVTAG DT only 199 | IN WDT NEXTTAG MD 200 | VBN VBD PREVTAG NNP 201 | VBN VBD PREVTAG NNPS 202 | NN JJ PREVBIGRAM JJ CC 203 | VB VBP PREVTAG EX 204 | NN VBG PREVTAG RB 205 | NNS VBZ PREVTAG WP 206 | VB NN PREVTAG POS 207 | VBZ NNS PREVTAG CD 208 | DT IN NEXTTAG DT 209 | IN DT PREVTAG TO 210 | RBR JJR NEXTTAG TO 211 | JJ VB PREVBIGRAM MD RB 212 | VB VBN PREVBIGRAM VBD RB 213 | JJR 
RB WDNEXTTAG better VB 214 | NNP VB SURROUNDTAG STAART PRP 215 | JJ NN NEXTBIGRAM MD VB 216 | VBN VBD SURROUNDTAG NNS PRP$ 217 | NN JJ SURROUNDTAG DT NNS 218 | IN RB RBIGRAM up , 219 | JJ RB PREVTAG MD 220 | -------------------------------------------------------------------------------- /data/CONTEXTUALRULEFILE-WSJ-NOLEX.txt: -------------------------------------------------------------------------------- 1 | NN VB PREVTAG TO 2 | VBP VB PREV1OR2OR3TAG MD 3 | NN VB PREV1OR2TAG MD 4 | VB NN PREV1OR2TAG DT 5 | VBD VBN PREV1OR2OR3TAG VBZ 6 | VBN VBD PREVTAG PRP 7 | VBN VBD PREVTAG NNP 8 | VBD VBN PREVTAG VBD 9 | VBP VB PREVTAG TO 10 | POS VBZ PREVTAG PRP 11 | VB VBP PREVTAG NNS 12 | VBD VBN PREV1OR2OR3TAG VBP 13 | IN WDT NEXT1OR2TAG VB 14 | VBD VBN PREV1OR2TAG VB 15 | VB VBP PREVTAG PRP 16 | IN WDT NEXTTAG VBZ 17 | IN DT NEXTTAG NN 18 | JJ NNP NEXTTAG NNP 19 | IN WDT NEXTTAG VBD 20 | JJR RBR NEXTTAG JJ 21 | JJ NN SURROUNDTAG DT IN 22 | IN WDT NEXTTAG VBP 23 | VBP VB PREVBIGRAM VBP RB 24 | NNS VBZ PREVTAG PRP 25 | VBN VBD SURROUNDTAG NN DT 26 | JJS RBS NEXTTAG JJ 27 | POS VBZ NEXT1OR2TAG DT 28 | NNP NN SURROUNDTAG STAART NNS 29 | IN RB NEXTBIGRAM RB IN 30 | VB NN PREV1OR2TAG IN 31 | IN RB NEXTBIGRAM JJ IN 32 | NNS VBZ NEXTTAG DT 33 | VBG NN PREVTAG JJ 34 | VB VBP PREVTAG WDT 35 | NN VBP PREVTAG NNS 36 | VBN VBD PREVTAG WP 37 | VB NN PREVTAG NN 38 | JJ NN NEXTTAG VBD 39 | NN VBP PREVTAG PRP 40 | VBN VBD SURROUNDTAG NNS DT 41 | NN VBG NEXTTAG DT 42 | RB JJ SURROUNDTAG DT NN 43 | NN VB PREVTAG RB 44 | VB VBP PREVBIGRAM NNS RB 45 | NNP JJ SURROUNDTAG STAART NN 46 | VB NN PREV1OR2TAG POS 47 | VBN VBD SURROUNDTAG , DT 48 | JJ VB PREVTAG TO 49 | IN WDT NEXTTAG VBN 50 | VBN VBD PREVTAG WDT 51 | JJ NN SURROUNDTAG JJ IN 52 | RB RP NEXTTAG DT 53 | VB VBN PREVTAG VBZ 54 | VBD VBN PREVBIGRAM VBD RB 55 | JJ RB NEXTTAG JJR 56 | VBP VB PREVBIGRAM VBZ RB 57 | VBP VB PREVBIGRAM VBD RB 58 | JJR RBR NEXTTAG RB 59 | DT IN NEXTBIGRAM NN NNS 60 | JJ NN SURROUNDTAG DT . 
61 | POS VBZ PREVTAG IN 62 | JJ VB PREVTAG MD 63 | VBN VBD PREVBIGRAM NNP RB 64 | VB NN PREVTAG JJ 65 | VBN VB PREVTAG TO 66 | VB VBP PREVTAG WP 67 | IN DT PREVTAG IN 68 | NNS NNP NEXTTAG NNP 69 | VBG NN SURROUNDTAG DT IN 70 | NN NNP NEXTTAG NNP 71 | VBN VBD PREVBIGRAM PRP RB 72 | RB IN NEXTTAG TO 73 | JJ RB NEXTTAG VBN 74 | VBD VBN PREVTAG IN 75 | VBD VBN PREVTAG DT 76 | NNP JJ SURROUNDTAG STAART NNS 77 | POS VBZ PREVTAG WP 78 | VBD VBN NEXTBIGRAM NNP CD 79 | IN DT NEXTTAG VBZ 80 | VB VBN PREVTAG VBP 81 | NNS VBZ PREVTAG RB 82 | IN WDT NEXTTAG MD 83 | POS VBZ PREVTAG DT 84 | NN PDT NEXTTAG DT 85 | JJR RBR NEXT1OR2TAG VBN 86 | POS '' NEXT1OR2TAG '' 87 | NNPS NNP NEXTTAG NNP 88 | VBP NN PREVTAG DT 89 | JJ NN SURROUNDTAG JJ . 90 | WDT DT PREVTAG CC 91 | JJ NN SURROUNDTAG DT , 92 | VBN VBD SURROUNDTAG NN JJ 93 | RBR JJR NEXT1OR2TAG NNS 94 | JJ VBN SURROUNDTAG STAART IN 95 | VBP VB PREV2TAG VB 96 | JJ NN NEXTTAG VBZ 97 | VBZ NNS PREVTAG JJ 98 | NN VB PREVTAG TO 99 | VB NN PREVTAG VB 100 | JJ NN SURROUNDTAG IN . 
101 | VBN VBD SURROUNDTAG NN NN 102 | VBN VBD SURROUNDTAG CC DT 103 | JJS RBS NEXTTAG RB 104 | DT WDT SURROUNDTAG NN VBZ 105 | NN CD PREVTAG $ 106 | JJ NN NEXTBIGRAM MD VB 107 | VB VBP PREVBIGRAM PRP RB 108 | NNP NNS SURROUNDTAG STAART IN 109 | RBS JJS PREVTAG IN 110 | IN RB SURROUNDTAG , , 111 | NNS VBZ PREVBIGRAM , WDT 112 | NN VBP PREVTAG WP 113 | NNS NN PREVBIGRAM CD CD 114 | VBN VBD PREVBIGRAM NN RB 115 | NN VBG NEXTTAG PRP$ 116 | NNPS NNS PREVTAG STAART 117 | DT WDT PREVTAG NNS 118 | NN VBP PREVTAG WDT 119 | RB RP NEXTTAG PRP$ 120 | VBP NN PREVTAG JJ 121 | RP IN PREV1OR2TAG , 122 | IN JJ SURROUNDTAG DT NN 123 | IN VB NEXT2TAG VB 124 | IN NN PREVTAG DT 125 | JJ NN NEXTTAG POS 126 | VBN VBD SURROUNDTAG NNS NNS 127 | VB NN PREV1OR2TAG PRP$ 128 | NN VB PREVTAG MD 129 | VB JJ PREVTAG DT 130 | DT RB NEXTTAG RBR 131 | VB VBN PREVTAG VBD 132 | POS VBZ PREVTAG EX 133 | VBD VBN PREVTAG VBN 134 | RB JJ NEXTTAG NNS 135 | RBR JJR NEXTTAG NN 136 | EX RB NEXT1OR2TAG IN 137 | WDT DT PREVTAG IN 138 | VBZ POS PREVTAG NNP 139 | VBZ NNS SURROUNDTAG NN . 140 | VBP VB PREV1OR2OR3TAG MD 141 | NNP NNS SURROUNDTAG STAART VBP 142 | VBG NNP PREVTAG NNP 143 | POS '' PREV1OR2OR3TAG `` 144 | NN VBG PREVTAG VBD 145 | VBG NN PREVTAG PRP$ 146 | NNS VBZ SURROUNDTAG NNP TO 147 | VBZ NNS PREVTAG PRP$ 148 | IN DT NEXT1OR2OR3TAG STAART 149 | EX RB NEXTTAG . 
150 | VBN VBD SURROUNDTAG NNS PRP$ 151 | VBN VBD SURROUNDTAG NNS JJ 152 | JJ RB PREVTAG MD 153 | NNS VBZ SURROUNDTAG , IN 154 | WDT IN NEXTTAG PRP 155 | RB NNP PREVTAG NNP 156 | JJS RBS NEXTTAG VBN 157 | VB RB NEXTTAG TO 158 | JJ NN SURROUNDTAG JJ , 159 | VBN VBD PREVBIGRAM , CC 160 | VBN VBD SURROUNDTAG NN PRP$ 161 | VBN VBD SURROUNDTAG NNS NN 162 | NN VBG PREVTAG VBP 163 | JJ RB NEXTTAG RBR 164 | VBG NN SURROUNDTAG DT , 165 | NNS VBZ NEXTBIGRAM JJ NNS 166 | VBZ NNS NEXTTAG VBP 167 | JJ NNP NEXTTAG POS 168 | JJ NN NEXTBIGRAM CC NN 169 | JJ NN SURROUNDTAG PRP$ IN 170 | JJ NN SURROUNDTAG NN IN 171 | NN JJ SURROUNDTAG STAART NNS 172 | IN RB NEXTBIGRAM . STAART 173 | RB IN SURROUNDTAG NN JJ 174 | JJ NN SURROUNDTAG IN IN 175 | NN VBG NEXTBIGRAM JJ NNS 176 | NNS NNPS PREVBIGRAM DT NNP 177 | PRP$ PRP NEXTTAG IN 178 | -------------------------------------------------------------------------------- /data/CONTEXTUALRULEFILE-WSJ.txt: -------------------------------------------------------------------------------- 1 | NN VB PREVTAG TO 2 | VBP VB PREV1OR2OR3TAG MD 3 | NN VB PREV1OR2TAG MD 4 | VB NN PREV1OR2TAG DT 5 | VBD VBN PREV1OR2OR3TAG VBZ 6 | VBN VBD PREVTAG PRP 7 | VBN VBD PREVTAG NNP 8 | VBD VBN PREVTAG VBD 9 | VBP VB PREVTAG TO 10 | POS VBZ PREVTAG PRP 11 | VB VBP PREVTAG NNS 12 | IN RB WDAND2AFT as as 13 | VBD VBN PREV1OR2WD have 14 | IN WDT NEXT1OR2TAG VB 15 | VB VBP PREVTAG PRP 16 | VBP VB PREV1OR2WD n't 17 | IN WDT NEXTTAG VBZ 18 | IN DT NEXTTAG NN 19 | JJ NNP NEXTTAG NNP 20 | IN WDT NEXTTAG VBD 21 | JJ NN NEXTWD of 22 | VBD VBN PREV1OR2WD be 23 | JJR RBR NEXTTAG JJ 24 | IN WDT NEXTTAG VBP 25 | JJS RBS WDNEXTTAG most JJ 26 | VBN VBD SURROUNDTAG NN DT 27 | NNS VBZ PREVTAG PRP 28 | POS VBZ NEXT1OR2TAG DT 29 | NNP NN SURROUNDTAG STAART NNS 30 | VBD VBN NEXTWD by 31 | VB NN PREV1OR2TAG IN 32 | VB VBP PREVTAG WDT 33 | VBG NN PREVTAG JJ 34 | NNS VBZ NEXTTAG DT 35 | VBN VBD PREVTAG WP 36 | NN VBP PREVTAG NNS 37 | VB NN PREVTAG NN 38 | NN VB PREVWD n't 39 | NN VBG NEXTTAG 
DT 40 | RB JJ NEXTTAG NN 41 | NN VBP PREVTAG PRP 42 | VBN VBD SURROUNDTAG NNS DT 43 | VB NN PREV1OR2TAG POS 44 | JJ NN NEXTTAG VBD 45 | RB RP WDNEXTTAG up DT 46 | JJ VB PREVTAG TO 47 | VBN VBD SURROUNDTAG , DT 48 | VBN VBD PREVWD that 49 | VB VBP PREVBIGRAM NNS RB 50 | NNP JJ SURROUNDTAG STAART NN 51 | VB VBN PREVTAG VBZ 52 | NNP JJ WDNEXTTAG American NNS 53 | JJ RB NEXTTAG JJR 54 | NNS NN CURWD yen 55 | IN WDT NEXTTAG VBD 56 | DT IN WDAND2TAGAFT that NNS 57 | POS VBZ PREVWD that 58 | JJ VB PREVTAG MD 59 | VB NN PREVTAG JJ 60 | JJR RBR NEXTTAG RB 61 | VBD VBN PREV1OR2WD are 62 | NN JJ WDNEXTTAG executive NN 63 | NNP JJ WDNEXTTAG American NN 64 | VBN VBD PREVTAG WDT 65 | VBD VBN PREVBIGRAM VBD RB 66 | JJ NN SURROUNDTAG DT . 67 | NNP JJ NEXTWD German 68 | VBN VB PREVTAG TO 69 | VBN VBD PREVBIGRAM NNP RB 70 | RB IN RBIGRAM up to 71 | VB VBP PREVTAG WP 72 | JJ NN SURROUNDTAG DT IN 73 | IN DT NEXTWD 's 74 | VBD VBN WDNEXTTAG ended NNP 75 | VBD VBN SURROUNDTAG DT NN 76 | NNS NNP NEXTTAG NNP 77 | NN NNP NEXTTAG NNP 78 | VBG NN SURROUNDTAG DT IN 79 | NNP JJ SURROUNDTAG STAART NNS 80 | RB RP WDPREVTAG VB up 81 | VBN VBD PREVBIGRAM PRP RB 82 | JJ RB NEXTTAG VBN 83 | NN VBP PREVTAG RB 84 | NNS VBZ PREVTAG RB 85 | POS VBZ PREVTAG WP 86 | VB VBN PREVWD have 87 | NN PDT WDNEXTTAG half DT 88 | IN DT PREVTAG IN 89 | IN WDT NEXTTAG MD 90 | POS VBZ PREVTAG DT 91 | NN NNP CURWD Integrated 92 | POS '' NEXT1OR2TAG '' 93 | VBD VBN PREVTAG IN 94 | JJR RBR NEXT1OR2TAG VBN 95 | JJS RBS WDNEXTTAG most RB 96 | JJ NN SURROUNDTAG JJ IN 97 | VBZ NNS PREVTAG JJ 98 | NNS VBZ WDPREVTAG JJ is 99 | JJ NN NEXTTAG VBZ 100 | VBP NN PREVTAG DT 101 | JJ NN SURROUNDTAG JJ . 102 | NNPS NNP NEXTTAG NNP 103 | WDT DT PREVTAG CC 104 | RB IN WDNEXTTAG so PRP 105 | VBP NN PREVWD earnings 106 | NN VBG PREVWD is 107 | NNS VBZ PREV1OR2WD Mr. 
108 | VBZ NNS PREVWD the 109 | RB RP WDPREVTAG VBN up 110 | NNPS NNS PREVTAG STAART 111 | VBN VBD SURROUNDTAG NN JJ 112 | VBP VB PREV2TAG VB 113 | RBR JJR NEXTTAG NNS 114 | JJ NN SURROUNDTAG DT , 115 | JJ NN SURROUNDTAG IN . 116 | NN VB PREVTAG TO 117 | VB NN PREVTAG VB 118 | NN VBP PREVWD who 119 | RB RP WDPREVTAG VBG up 120 | NN RB WDNEXTTAG right RB 121 | VBZ POS WDPREVTAG NNP 's 122 | JJ RP WDNEXTTAG up NN 123 | VBN VBD SURROUNDTAG NN NN 124 | VBN VBD SURROUNDTAG CC DT 125 | JJ NN NEXTBIGRAM MD VB 126 | JJ RB WDNEXTTAG early IN 127 | JJ VBN SURROUNDTAG STAART IN 128 | IN RB RBIGRAM though , 129 | VBD VBN PREV1OR2WD been 130 | DT PDT WDNEXTTAG all DT 131 | VBN VBD PREVBIGRAM NN RB 132 | NN VB PREVWD help 133 | VBP VB PREV1OR2WD not 134 | VBP NN PREVTAG JJ 135 | DT WDT PREVTAG NNS 136 | NN VBP PREVTAG WDT 137 | VB RB RBIGRAM close to 138 | NNS VBZ PREVBIGRAM , WDT 139 | IN RP WDNEXTTAG out DT 140 | DT RB NEXTWD longer 141 | IN JJ SURROUNDTAG DT NN 142 | DT WDT SURROUNDTAG NN VBZ 143 | IN VB NEXT2TAG VB 144 | IN NN PREVTAG DT 145 | VBN VBD SURROUNDTAG NNS NNS 146 | IN RB RBIGRAM about $ 147 | EX RB NEXT1OR2TAG IN 148 | NN VBG NEXTTAG PRP$ 149 | NN VBG CURWD living 150 | VBZ NNS PREVTAG PRP$ 151 | RBR JJR NEXTTAG NN 152 | RBR JJR CURWD higher 153 | VB VBP PREVBIGRAM PRP RB 154 | NN VB PREVTAG MD 155 | VB NN PREV1OR2TAG PRP$ 156 | RP IN PREV1OR2TAG , 157 | VB JJ PREVTAG DT 158 | DT IN PREVWD out 159 | POS VBZ PREVTAG EX 160 | JJ NN NEXTTAG POS 161 | NN JJ CURWD first 162 | VBD VBN PREVWD the 163 | NNS VBZ WDPREVTAG NNP plans 164 | NNP NNS SURROUNDTAG STAART IN 165 | RB JJ NEXTTAG NNS 166 | JJ RB CURWD just 167 | VBP NN PREVWD sales 168 | NNS NNPS PREVWD Orange 169 | VB VBN PREVTAG VBD 170 | WDT DT PREVTAG IN 171 | NN JJ WDNEXTTAG right NN 172 | NN VBG WDNEXTTAG operating IN 173 | JJ VBN CURWD insured 174 | JJ NNP LBIGRAM STAART U.S. 
175 | IN DT NEXT1OR2OR3TAG STAART 176 | POS '' PREV1OR2OR3TAG `` 177 | NN JJ WDNEXTTAG official NN 178 | NNP JJ CURWD Irish 179 | JJ RB NEXTTAG RBR 180 | VBG NN WDPREVTAG DT selling 181 | VBP VB PREV1OR2OR3TAG MD 182 | WDT IN NEXTTAG PRP 183 | EX RB NEXTTAG . 184 | VBN VBD SURROUNDTAG NNS PRP$ 185 | VBN VBD CURWD said 186 | JJ RB PREVTAG MD 187 | NN VBG NEXTBIGRAM JJ NNS 188 | JJ RB WDNEXTTAG late IN 189 | VBG NN PREVTAG PRP$ 190 | VBZ NNS NEXTTAG VBP 191 | NN NNP WDPREVTAG DT CD 192 | NN VBN PREVWD be 193 | JJS RBS NEXTTAG VBN 194 | VBN VBD SURROUNDTAG NN PRP$ 195 | VBN VBD SURROUNDTAG NNS JJ 196 | VBN VBD SURROUNDTAG NNS NN 197 | VBD VBN WDNEXTTAG increased NN 198 | VBZ NNS NEXTWD of 199 | IN RP WDAND2TAGAFT out NNS 200 | JJ NNP NEXTTAG POS 201 | RB RP WDNEXTTAG down DT 202 | CD NNS CURWD 1970s 203 | VBG NNP CURWD Working 204 | VBN VB PREVTAG MD 205 | JJ NN NEXTBIGRAM CC NN 206 | NN JJ SURROUNDTAG STAART NNS 207 | VBN VBD PREVBIGRAM , CC 208 | IN RB NEXTBIGRAM . STAART 209 | NN VBG PREVWD was 210 | NNP NNPS CURWD Cowboys 211 | VBZ NNS PREVWD phone 212 | NNP NNS SURROUNDTAG STAART VBP 213 | RBR JJR WDNEXTTAG lower JJ 214 | PRP$ PRP NEXTTAG IN 215 | VBD VB PREVTAG TO 216 | JJ NN WDPREVTAG NN chief 217 | JJ NN SURROUNDTAG JJ , 218 | NN JJ WDPREVTAG DT third 219 | VBN VBD SURROUNDTAG NNS NNP 220 | NNP NN SURROUNDTAG STAART NN 221 | NNP NN CURWD HDTV 222 | VBG NN SURROUNDTAG DT , 223 | VBG NN SURROUNDTAG DT . 224 | NNS VBZ PREVTAG WP 225 | NN VB SURROUNDTAG CC DT 226 | NNPS NNP WDAND2TAGBFR IN Securities 227 | RP IN PREVTAG NNS 228 | VBP NN LBIGRAM funds rate 229 | VBP NN WDPREVTAG NNS market 230 | DT RB RBIGRAM either . 
231 | VBN NN SURROUNDTAG DT IN 232 | VBD VB PREV1OR2OR3TAG MD 233 | NN JJ NEXTWD oil 234 | VBN VBD SURROUNDTAG , $ 235 | VBD VBN PREVBIGRAM DT RB 236 | VBN JJ PREVWD by 237 | NNP JJ WDNEXTTAG American JJ 238 | NN VBG PREVTAG VBP 239 | JJ RB LBIGRAM very much 240 | NN VBG RBIGRAM operating officer 241 | RB IN RBIGRAM up for 242 | NNS VBZ NEXTBIGRAM JJ NNS 243 | NNS VBZ SURROUNDTAG , IN 244 | VB VBP PREVTAG NNPS 245 | IN RP WDAND2TAGAFT out IN 246 | NNPS NNP PREVBIGRAM CC NNP 247 | NN RB RBIGRAM close to 248 | RBR RB PREVWD no 249 | JJ VBD NEXTTAG DT 250 | RB NNP PREVTAG NNP 251 | MD NN PREVWD good 252 | JJ NN WDPREVTAG NN giant 253 | NN JJ WDNEXTTAG official NNS 254 | VBN VBD SURROUNDTAG , PRP$ 255 | VBN VBD SURROUNDTAG , RB 256 | VBN VBD SURROUNDTAG NN PRP 257 | NNP JJ WDNEXTTAG South JJ 258 | NN VBG PREVTAG RB 259 | NNS VBZ SURROUNDTAG , TO 260 | VBZ NNS SURROUNDTAG NN . 261 | NN VB NEXTTAG PRP$ 262 | VBP VB PREV1OR2WD do 263 | VB JJ NEXTWD countries 264 | IN WDT NEXTBIGRAM RB VBZ 265 | JJ VB NEXTTAG DT 266 | WDT DT NEXTBIGRAM VBZ , 267 | NNP RB RBIGRAM First , 268 | DT NNP WDNEXTTAG A VBZ 269 | JJ RBR RBIGRAM further , 270 | CD PRP WDNEXTTAG one MD 271 | POS '' PREV1OR2OR3TAG . 
272 | PRP NN PREVTAG -LRB- 273 | VBN VBD SURROUNDTAG , PRP 274 | VBN VBD SURROUNDTAG NN NNS 275 | VBN VBD SURROUNDTAG NN RP 276 | NNP NN LBIGRAM STAART Business 277 | VBD VBN PREVTAG VBG 278 | IN RB RBIGRAM before , 279 | IN RB WDAND2AFT As as 280 | NNP JJ LBIGRAM New York-based 281 | NNP JJ CURWD Mexican 282 | NNP NNPS WDNEXTTAG Motors NNP 283 | NNP NNPS WDPREVTAG NNP Enterprises 284 | JJ RB WDNEXTTAG long IN 285 | -------------------------------------------------------------------------------- /data/LEXICALRULEFILE-BROWN.txt: -------------------------------------------------------------------------------- 1 | NN s fhassuf 1 NNS x 2 | ed hassuf 2 VBN x 3 | ing hassuf 3 VBG x 4 | ly hassuf 2 RB x 5 | ly addsuf 2 JJ x 6 | - char JJ x 7 | 1 char CD x 8 | NN ic fhassuf 2 JJ x 9 | NN to fgoodright VB x 10 | NNS ss fhassuf 2 NN x 11 | NN ble fhassuf 3 JJ x 12 | NN al fhassuf 2 JJ x 13 | 0 char CD x 14 | VB the fgoodright NN x 15 | NNS us fhassuf 2 JJ x 16 | NN ive fhassuf 3 JJ x 17 | he goodright VBD x 18 | un deletepref 2 JJ x 19 | 2 char CD x 20 | NN y fdeletesuf 1 JJ x 21 | JJ of fgoodleft NN x 22 | NN est fhassuf 3 JJS x 23 | ts hassuf 2 NNS x 24 | 4 char CD x 25 | NN less fhassuf 4 JJ x 26 | NNS the fgoodleft VBZ x 27 | NNS is fhassuf 2 NN x 28 | VBD s fhassuf 1 VBZ x 29 | men deletesuf 3 NNS x 30 | ness addsuf 4 JJ x 31 | ions hassuf 4 NNS x 32 | NNP the fgoodleft IN x 33 | JJ rs fhassuf 2 NNS x 34 | ly hassuf 2 RB x 35 | NN ary fhassuf 3 JJ x 36 | 1 addpref 1 CD x 37 | NN his fgoodleft IN x 38 | JJ ion fhassuf 3 NN x 39 | years goodleft CD x 40 | ment addsuf 4 VB x 41 | NN had fgoodright VBN x 42 | ful hassuf 3 JJ x 43 | NN ize fhassuf 3 VB x 44 | NN lar fhassuf 3 JJ x 45 | CD h fhassuf 1 JJ x 46 | CD a fchar JJ x 47 | JJ B fchar NNP x 48 | NN was fgoodright JJ x 49 | VBN ked fhassuf 3 VBD x 50 | JJ M fchar NNP x 51 | ness hassuf 4 NN x 52 | NN ate fhassuf 3 VB x 53 | VB the fgoodright NN x 54 | NN than fgoodleft JJR x 55 | VBZ the fgoodright NNS x 56 | NNP ans 
fhassuf 3 NNPS x 57 | these goodleft IN x 58 | VBN He fgoodright VBD x 59 | NN 3 fchar CD x 60 | 'll goodleft PRP x 61 | ing deletesuf 3 VBG x 62 | NN a fdeletepref 1 RB x 63 | JJ es fhassuf 2 NNS x 64 | JJ D fchar NNP x 65 | like deletesuf 4 JJ x 66 | NN tory fhassuf 4 JJ x 67 | JJ is fgoodleft NN x 68 | VBD have fgoodleft MD x 69 | NNS ates fhassuf 4 VBZ x 70 | VBG is fgoodleft NN x 71 | some hassuf 4 JJ x 72 | JJS s faddsuf 1 NN x 73 | MD was fgoodright RB x 74 | NN id fhassuf 2 JJ x 75 | own goodleft PRP$ x 76 | JJ s fdeletesuf 1 NNS x 77 | ity hassuf 3 NN x 78 | NN ish fdeletesuf 3 JJ x 79 | JJ nce fhassuf 3 NN x 80 | fies hassuf 4 VBZ x 81 | ors hassuf 3 NNS x 82 | VBN ped fhassuf 3 VBD x 83 | NNP ses fhassuf 3 NNPS x 84 | NN ity faddsuf 3 JJ x 85 | JJ W fchar NNP x 86 | wise deletesuf 4 RB x 87 | 7 char CD x 88 | izes hassuf 4 VBZ x 89 | self deletesuf 4 PRP x 90 | JJ G fchar NNP x 91 | VBN owed fhassuf 4 VBD x 92 | VB a fgoodright NN x 93 | JJ H fchar NNP x 94 | 'll goodright VB x 95 | JJ 's fgoodleft NN x 96 | five hassuf 4 CD x 97 | teen hassuf 4 CD x 98 | NNS oses fhassuf 4 VBZ x 99 | body addsuf 4 DT x 100 | VBG of fgoodleft NN x 101 | NNS es faddsuf 2 NN x 102 | lent hassuf 4 JJ x 103 | NNP ces fhassuf 3 NNS x 104 | JJ sal fhassuf 3 NN x 105 | JJ mble fhassuf 4 NN x 106 | JJ I fchar NNP x 107 | enth hassuf 4 JJ x 108 | ify hassuf 3 VB x 109 | VBN eed fhassuf 3 NN x 110 | JJ er fhassuf 2 NN x 111 | NN ward fdeletesuf 4 RB x 112 | VBD are fgoodright VBN x 113 | VBN kled fhassuf 4 VBD x 114 | NNS en fdeletepref 2 VBZ x 115 | gled hassuf 4 VBD x 116 | JJ Y fchar NN x 117 | JJ re- fdeletepref 3 NN x 118 | JJ ery fhassuf 3 NN x 119 | Br haspref 2 NNP x 120 | CD p fchar JJ x 121 | CD . 
fhassuf 1 JJ x 122 | VBN Un fdeletepref 2 JJ x 123 | NN ky fhassuf 2 JJ x 124 | NN ern fdeletesuf 3 JJ x 125 | NN tty fhassuf 3 JJ x 126 | VBN scr fhaspref 3 VBD x 127 | udes hassuf 4 VBZ x 128 | VBZ the fgoodright NNS x 129 | NNS which fgoodright VBZ x 130 | NNPS l fchar NNS x 131 | Some deletepref 4 RB x 132 | body deletesuf 4 NN x 133 | NN most fhassuf 4 JJ x 134 | VBN en faddpref 2 VBD x 135 | -------------------------------------------------------------------------------- /data/LEXICALRULEFILE-WSJ.txt: -------------------------------------------------------------------------------- 1 | NN s fhassuf 1 NNS x 2 | NN . fchar CD x 3 | NN - fchar JJ x 4 | NN ed fhassuf 2 VBN x 5 | NN ing fhassuf 3 VBG x 6 | ly hassuf 2 RB x 7 | ly addsuf 2 JJ x 8 | NN $ fgoodright CD x 9 | NN al fhassuf 2 JJ x 10 | NN would fgoodright VB x 11 | NN 0 fchar CD x 12 | NN be fgoodright JJ x 13 | NNS us fhassuf 2 JJ x 14 | NNS it fgoodright VBZ x 15 | NN ble fhassuf 3 JJ x 16 | NN ic fhassuf 2 JJ x 17 | NN 1 fchar CD x 18 | NNS ss fhassuf 2 NN x 19 | un deletepref 2 JJ x 20 | NN ive fhassuf 3 JJ x 21 | NNP ed fhassuf 2 JJ x 22 | NN n't fgoodright VB x 23 | VB the fgoodright NN x 24 | NNS he fgoodright VBZ x 25 | VBN he fgoodright VBD x 26 | NN are fgoodright JJ x 27 | JJ was fgoodleft NN x 28 | NN est fhassuf 3 JJS x 29 | VBZ The fgoodright NNS x 30 | NNP ts fhassuf 2 NNS x 31 | NN 4 fchar CD x 32 | NN ize fhassuf 3 VB x 33 | .. hassuf 2 : x 34 | ful hassuf 3 JJ x 35 | NN ate fhassuf 3 VB x 36 | NNP ing fhassuf 3 VBG x 37 | VBG is fgoodleft NN x 38 | NN less fhassuf 4 JJ x 39 | NN ary fhassuf 3 JJ x 40 | Co. goodleft NNP x 41 | NN ant fhassuf 3 JJ x 42 | million goodleft CD x 43 | JJ their fgoodleft IN x 44 | NN he fgoodright VBD x 45 | Mr. 
goodright NNP x 46 | JJ of fgoodleft NN x 47 | NN so fgoodright JJ x 48 | NN y fdeletesuf 1 JJ x 49 | VBN which fgoodright VBD x 50 | VBD been fgoodright VBN x 51 | VB a fgoodright NN x 52 | NN economic fgoodleft JJ x 53 | 9 char CD x 54 | CD t fchar JJ x 55 | NN can fgoodright VB x 56 | VB the fgoodright NN x 57 | JJ S-T-A-R-T fgoodright VBN x 58 | VBN - fchar JJ x 59 | NN lar fhassuf 3 JJ x 60 | NNP ans fhassuf 3 NNPS x 61 | NN men fhassuf 3 NNS x 62 | CD d fchar JJ x 63 | JJ n fdeletesuf 1 VBN x 64 | JJ 's fgoodleft NN x 65 | NNS is fhassuf 2 NN x 66 | ES hassuf 2 NNS x 67 | JJ er fdeletesuf 2 JJR x 68 | Inc. goodleft NNP x 69 | NN 2 fchar CD x 70 | VBD be fgoodleft MD x 71 | ons hassuf 3 NNS x 72 | RB - fchar JJ x 73 | NN very fgoodright JJ x 74 | ous hassuf 3 JJ x 75 | NN a fdeletepref 1 RB x 76 | NNP people fgoodleft JJ x 77 | VB have fgoodleft RB x 78 | NNS It fgoodright VBZ x 79 | NN id fhassuf 2 JJ x 80 | JJ may fgoodleft NN x 81 | VBN but fgoodright VBD x 82 | RS hassuf 2 NNS x 83 | JJ stry fhassuf 4 NN x 84 | NNS them fgoodleft VBZ x 85 | VBZ were fgoodleft NNS x 86 | NN ing faddsuf 3 VB x 87 | JJ s faddsuf 1 NN x 88 | NN 7 fchar CD x 89 | NN d faddsuf 1 VB x 90 | VB but fgoodleft NN x 91 | NN 3 fchar CD x 92 | NN est faddsuf 3 JJ x 93 | NN en fhassuf 2 VBN x 94 | NN costs fgoodright IN x 95 | NN 8 fchar CD x 96 | VB b fhaspref 1 NN x 97 | zes hassuf 3 VBZ x 98 | VBN s faddsuf 1 NN x 99 | some hassuf 4 JJ x 100 | NN ic fhassuf 2 JJ x 101 | ly addsuf 2 JJ x 102 | ness addsuf 4 JJ x 103 | JJS s faddsuf 1 NN x 104 | NN ier fhassuf 3 JJR x 105 | NN ky fhassuf 2 JJ x 106 | tyle hassuf 4 JJ x 107 | NNS ates fhassuf 4 VBZ x 108 | fy hassuf 2 VB x 109 | body addsuf 4 DT x 110 | NN ways fgoodleft JJ x 111 | NNP ies fhassuf 3 NNPS x 112 | VB negative fgoodright NN x 113 | ders hassuf 4 NNS x 114 | ds hassuf 2 NNS x 115 | -day addsuf 4 CD x 116 | nian hassuf 4 JJ x 117 | JJR s faddsuf 1 NN x 118 | ppy hassuf 3 JJ x 119 | NN ish fhassuf 3 JJ x 120 | tors hassuf 4 
NNS x 121 | oses hassuf 4 VBZ x 122 | NNS oves fhassuf 4 VBZ x 123 | VBN un fhaspref 2 JJ x 124 | lent hassuf 4 JJ x 125 | NN ward fdeletesuf 4 RB x 126 | VB k fchar NN x 127 | VB r fhassuf 1 NN x 128 | VB e fdeletesuf 1 NN x 129 | NNS Engelken fgoodright VBZ x 130 | NN ient fhassuf 4 JJ x 131 | ED hassuf 2 VBD x 132 | VBG B fchar NNP x 133 | VB le fhassuf 2 NN x 134 | ment addsuf 4 VB x 135 | ING hassuf 3 NN x 136 | JJ ery fhassuf 3 NN x 137 | JJ tus fhassuf 3 NN x 138 | JJ car fhassuf 3 NN x 139 | NN 6 fchar CD x 140 | NNS 0 fchar CD x 141 | JJ ing fdeletesuf 3 VBG x 142 | here hassuf 4 RB x 143 | VBN scr fhaspref 3 VBD x 144 | uces hassuf 4 VBZ x 145 | fies hassuf 4 VBZ x 146 | self deletesuf 4 PRP x 147 | NNP $ fchar $ x 148 | VBN wa fhaspref 2 VBD x 149 | -------------------------------------------------------------------------------- /data/NBEST-RULES.txt: -------------------------------------------------------------------------------- 1 | VBN JJ PREV1OR2TAG DT 2 | VBG NN PREV1OR2TAG POS 3 | VBN JJ PREVTAG IN 4 | VBN JJ NEXTTAG NN 5 | RB IN NEXT2TAG CD 6 | JJ NN PREV2WD president 7 | VBG NN PREVWD the 8 | NN JJ CURRENTWD average 9 | NNP NNPS CURRENTWD Securities 10 | VBN VBD PREVBIGRAM DT NN 11 | IN RB PREVBIGRAM CD , 12 | RB IN NEXTTAG DT 13 | NNP JJ NEXTTAG JJ 14 | RP RB WDPREVTAG VBD up 15 | VBN VBD PREVBIGRAM NNP , 16 | NN NNP PREV1OR2TAG NNP 17 | RP IN NEXT1OR2OR3TAG TO 18 | VBN VBD PREVBIGRAM JJ NN 19 | JJ RB CURRENTWD long 20 | VBN JJ NEXTTAG NNS 21 | RP IN CURRENTWD off 22 | VBG NN NEXT1OR2OR3TAG VBD 23 | IN RB NEXTBIGRAM . STAART 24 | NN JJ NEXT2WD of 25 | IN RB WDNEXTTAG about CD 26 | NN JJ PREV1OR2OR3TAG . 27 | NN NNS PREVTAG CD 28 | VBN VBD NEXTTAG PRP 29 | NNP NNPS NEXTWD Corp. 30 | JJ RB WDNEXTTAG much IN 31 | NNS VBZ PREVBIGRAM DT NN 32 | RB RP PREVTAG VB 33 | NNPS NNP PREV1OR2OR3TAG NNP 34 | VBP NN PREVBIGRAM DT NNS 35 | JJR RBR NEXTBIGRAM IN DT 36 | VBD VBN CURRENTWD called 37 | NN JJ NEXTBIGRAM JJ NN 38 | RBR RB SURROUNDTAG NN . 
39 | VBN VBD PREVTAG CC 40 | VBN VBD SURROUNDTAG RB DT 41 | NN JJ SURROUNDTAG DT NN 42 | JJ PDT NEXT1OR2TAG DT 43 | VBN VBD NEXT1OR2OR3TAG JJR 44 | VBG JJ SURROUNDTAG DT NN 45 | RP RB NEXTTAG IN 46 | JJ NN CURRENTWD chief 47 | IN RP NEXTTAG . 48 | JJ RB CURRENTWD much 49 | JJ NN NEXTWD and 50 | NN JJ SURROUNDTAG IN NNS 51 | JJR RBR NEXT1OR2OR3TAG JJ 52 | VBN VBD NEXTTAG CD 53 | VB JJ NEXTTAG NNS 54 | VBG JJ NEXTTAG NNS 55 | RP IN PREV1OR2OR3TAG VB 56 | IN RP CURRENTWD out 57 | VBZ NNS PREV1OR2TAG IN 58 | JJ NNP PREVTAG NNP 59 | NNP JJ NEXTTAG NNS 60 | VBG NN PREVTAG NN 61 | JJ NN CURRENTWD executive 62 | JJ NN NEXT2TAG DT 63 | NN VBG PREV1OR2OR3TAG VBP 64 | VBD VBN NEXTWD in 65 | VBN VBD PREV2TAG JJ 66 | IN RB NEXT1OR2WD from 67 | IN WDT NEXT1OR2TAG RB 68 | RB IN CURRENTWD down 69 | NNP JJ SURROUNDTAG DT NN 70 | NNP NNPS NEXTWD Inc. 71 | VBN VBD PREVTAG NN 72 | VBN VBD PREVTAG NNS 73 | JJ RB NEXTBIGRAM . STAART 74 | JJ NN PREV2WD the 75 | PRP$ PRP CURRENTWD her 76 | RP IN CURRENTWD up 77 | CD NN PREV1OR2TAG DT 78 | NNS JJ NEXTTAG NN 79 | IN RB WDNEXTTAG as JJ 80 | NN VB NEXTTAG DT 81 | NNP JJ NEXTTAG NN 82 | JJR RBR RBIGRAM more than 83 | NN VBG PREV1OR2TAG NNS 84 | VBP VB PREVTAG RB 85 | VBG NN NEXTTAG NN 86 | RB IN CURRENTWD ago 87 | JJ NN NEXT2TAG NNP 88 | NN NNP SURROUNDTAG STAART NNS 89 | NNP NN PREV1OR2OR3TAG `` 90 | JJ RB NEXT1OR2TAG DT 91 | RP RB CURRENTWD up 92 | JJ NN NEXTTAG , 93 | NN VB PREVWD and 94 | NNP NNPS PREVBIGRAM DT NNP 95 | NNS NNP PREV1OR2OR3TAG NNP 96 | NN JJ NEXTTAG NNS 97 | VBG NN NEXTTAG NNS 98 | NNS VBZ PREV2TAG NNP 99 | JJ NN PREVWD and 100 | NN JJ NEXT2TAG , 101 | VBD VBN NEXTTAG NN 102 | NNP NNPS SURROUNDTAG NNP NNP 103 | RB RP CURRENTWD down 104 | VB NN NEXT1OR2TAG , 105 | VBZ NNS PREVTAG NN 106 | VBZ NNS NEXTTAG IN 107 | VBN JJ PREV1OR2OR3TAG PRP 108 | NN VBG PREVTAG IN 109 | JJ NN NEXT1OR2OR3TAG VBZ 110 | VBD VBN PREV1OR2TAG RB 111 | VBD VBN PREV1OR2OR3TAG VBP 112 | DT IN WDNEXTTAG that NN 113 | VBN VBD PREVTAG , 114 | NNS CD 
PREV1OR2WD the 115 | RBR JJR NEXTTAG JJ 116 | JJR RBR PREV1OR2OR3TAG NN 117 | VBN JJ NEXT1OR2TAG JJ 118 | JJ IN CURRENTWD next 119 | JJ NN NEXT1OR2TAG VBD 120 | VBD VBN PREVTAG CC 121 | JJ RB NEXTTAG IN 122 | IN RB NEXTTAG RB 123 | NN JJ NEXTTAG NN 124 | NN JJ NEXT1OR2TAG JJ 125 | VB VBP PREV1OR2WD and 126 | VBN JJ PREVTAG RB 127 | NNS VBZ PREVTAG NNP 128 | NNP NN PREVTAG STAART 129 | NN VBG NEXTTAG NN 130 | JJ NN PREVTAG JJ 131 | IN RB CURRENTWD out 132 | IN RP CURRENTWD over 133 | IN WDT PREVTAG NN 134 | RBR JJR ALWAYS DUMMY 135 | IN DT NEXT1OR2TAG JJ 136 | NNS VBZ PREV2TAG NN 137 | NNS NNPS PREV1OR2TAG STAART 138 | VB JJ NEXT1OR2TAG NN 139 | NNP NNS NEXT1OR2OR3TAG VB 140 | JJ NN PREV1OR2OR3TAG POS 141 | VBP VB PREV1OR2OR3TAG NNP 142 | NN VBG NEXTTAG NNS 143 | IN RB CURRENTWD about 144 | VBG JJ NEXT1OR2TAG NN 145 | JJ NN NEXT1OR2TAG , 146 | JJ NN NEXT2WD . 147 | VBN VBD PREVTAG RB 148 | JJ NNP PREV1OR2OR3TAG . 149 | VB VBP PREV1OR2TAG , 150 | JJS RBS ALWAYS DUMMY 151 | VBZ NNS NEXTTAG DT 152 | JJ RB NEXTTAG JJ 153 | JJR RBR ALWAYS DUMMY 154 | IN RB NEXTTAG IN 155 | NNP NNPS NEXT1OR2WD and 156 | JJ RB NEXT1OR2TAG TO 157 | VBG NN PREV1OR2TAG IN 158 | JJ NN NEXT1OR2OR3TAG DT 159 | NNP NNPS PREVBIGRAM NNP NNP 160 | VBD VBN NEXTTAG IN 161 | IN DT NEXT1OR2OR3TAG NNP 162 | NNP NNS PREV1OR2OR3TAG . 
163 | NNP NNPS NEXT1OR2TAG IN 164 | VBP VB NEXT1OR2OR3TAG JJ 165 | VB VBD ALWAYS DUMMY 166 | NNS VBZ PREV2TAG JJ 167 | VBG NN ALWAYS DUMMY 168 | NNP JJ SURROUNDTAG DT NNP 169 | NN JJ PREV1OR2TAG VBD 170 | NNS VBZ PREV1OR2OR3TAG , 171 | VBG NNP ALWAYS DUMMY 172 | VBN JJ NEXT1OR2TAG IN 173 | VBP VB PREV1OR2OR3TAG PRP 174 | JJ NN PREV1OR2OR3TAG NN 175 | JJ RB PREV1OR2OR3TAG STAART 176 | VBN NN ALWAYS DUMMY 177 | VBD VBN NEXTTAG TO 178 | NNP JJ NEXT1OR2OR3TAG NNS 179 | WDT DT ALWAYS DUMMY 180 | VB NN PREVBIGRAM NN TO 181 | RB JJ PREV1OR2OR3TAG DT 182 | NNP NNPS PREV2TAG IN 183 | VB VBP PREV1OR2TAG NNP 184 | VBN VBD PREV1OR2OR3TAG IN 185 | RB IN ALWAYS DUMMY 186 | RB RBR ALWAYS DUMMY 187 | IN RB PREVTAG , 188 | NN JJ PREVTAG DT 189 | VBP NN ALWAYS DUMMY 190 | NNP NN NEXT1OR2TAG CD 191 | NNP NN NEXT2TAG IN 192 | NNP NN PREVTAG DT 193 | NNP NN NEXTBIGRAM NNP NNP 194 | VB NN NEXT1OR2TAG IN 195 | IN RP PREVTAG VB 196 | VBG JJ ALWAYS DUMMY 197 | VBN JJ ALWAYS DUMMY 198 | JJ NNP PREV1OR2TAG IN 199 | JJ NNP PREV1OR2TAG DT 200 | NNP NNPS PREVTAG NNP 201 | JJ NN PREV1OR2OR3TAG DT 202 | RB JJ NEXT1OR2OR3TAG IN 203 | VBD VBN NEXT1OR2TAG JJ 204 | NN VBG NEXT1OR2OR3TAG IN 205 | VBN VBD NEXT1OR2OR3TAG DT 206 | NNS NN ALWAYS DUMMY 207 | NN JJ NEXT1OR2OR3TAG NN 208 | NNP JJ NEXTTAG NNP 209 | JJ RB NEXT1OR2TAG IN 210 | NN VBG ALWAYS DUMMY 211 | VBN VB ALWAYS DUMMY 212 | JJ VBN ALWAYS DUMMY 213 | VBZ NNS ALWAYS DUMMY 214 | JJ NN ALWAYS DUMMY 215 | NN NNP ALWAYS DUMMY 216 | DT RB ALWAYS DUMMY 217 | NN JJ PREV1OR2OR3TAG IN 218 | VBP VB ALWAYS DUMMY 219 | NNP NN NEXTTAG NNP 220 | NNP NNPS PREV1OR2OR3TAG IN 221 | RB JJ ALWAYS DUMMY 222 | NNS NNPS ALWAYS DUMMY 223 | VBD VBN PREV1OR2TAG NN 224 | NNS NNP ALWAYS DUMMY 225 | NN NNS ALWAYS DUMMY 226 | JJ RB ALWAYS DUMMY 227 | NN JJ ALWAYS DUMMY 228 | VBD JJ ALWAYS DUMMY 229 | VB NN ALWAYS DUMMY 230 | JJ VBG ALWAYS DUMMY 231 | NN RB ALWAYS DUMMY 232 | VBN VBD ALWAYS DUMMY 233 | NNP NNPS ALWAYS DUMMY 234 | NN VBD ALWAYS DUMMY 235 | NNP NN 
PREV1OR2OR3TAG IN 236 | IN JJ ALWAYS DUMMY 237 | NN FW ALWAYS DUMMY 238 | VBD VBN ALWAYS DUMMY 239 | NN VBP NEXT1OR2OR3TAG NN 240 | NNS VBZ ALWAYS DUMMY 241 | NNP NN ALWAYS DUMMY 242 | JJ VB ALWAYS DUMMY 243 | NN VBN ALWAYS DUMMY 244 | NNP NNS ALWAYS DUMMY 245 | POS VBZ ALWAYS DUMMY 246 | VB VBP ALWAYS DUMMY 247 | NN VBP ALWAYS DUMMY 248 | NN VB PREV1OR2OR3TAG NN 249 | NNP JJ ALWAYS DUMMY 250 | NNP VBZ ALWAYS DUMMY 251 | NN VB ALWAYS DUMMY 252 | IN RB ALWAYS DUMMY 253 | IN RP ALWAYS DUMMY 254 | DT NNP ALWAYS DUMMY 255 | 256 | -------------------------------------------------------------------------------- /data/TESTRULEFILE-IAN.txt: -------------------------------------------------------------------------------- 1 | NN s fhassuf 1 NNS x 2 | NN . fchar CD x 3 | NN - fchar JJ x 4 | NN ed fhassuf 2 VBN x 5 | NN ing fhassuf 3 VBG x 6 | ly hassuf 2 RB x 7 | NN al fhassuf 2 JJ x 8 | NN 0 fchar CD x 9 | NN be fgoodright JJ x 10 | NNS us fhassuf 2 JJ x 11 | NNS it fgoodright VBZ x 12 | NN ble fhassuf 3 JJ x 13 | NN ic fhassuf 2 JJ x 14 | NN 1 fchar CD x 15 | NNS ss fhassuf 2 NN x 16 | NN ive fhassuf 3 JJ x 17 | NNP ed fhassuf 2 JJ x 18 | NN est fhassuf 3 JJS x 19 | NNP ts fhassuf 2 NNS x 20 | NN 4 fchar CD x 21 | NN ize fhassuf 3 VB x 22 | .. 
hassuf 2 : x 23 | ful hassuf 3 JJ x 24 | NN ate fhassuf 3 VB x 25 | NNP ing fhassuf 3 VBG x 26 | NN less fhassuf 4 JJ x 27 | NN ary fhassuf 3 JJ x 28 | NN ant fhassuf 3 JJ x 29 | 9 char CD x 30 | CD t fchar JJ x 31 | VBN - fchar JJ x 32 | NN lar fhassuf 3 JJ x 33 | NNP ans fhassuf 3 NNPS x 34 | NN men fhassuf 3 NNS x 35 | CD d fchar JJ x 36 | NNS is fhassuf 2 NN x 37 | ES hassuf 2 NNS x 38 | NN 2 fchar CD x 39 | ons hassuf 3 NNS x 40 | RB - fchar JJ x 41 | ous hassuf 3 JJ x 42 | NN id fhassuf 2 JJ x 43 | RS hassuf 2 NNS x 44 | JJ stry fhassuf 4 NN x 45 | NN 7 fchar CD x 46 | NN 3 fchar CD x 47 | NN en fhassuf 2 VBN x 48 | NN 8 fchar CD x 49 | VB b fhaspref 1 NN x 50 | zes hassuf 3 VBZ x 51 | some hassuf 4 JJ x 52 | NN ic fhassuf 2 JJ x 53 | NN ier fhassuf 3 JJR x 54 | NN ky fhassuf 2 JJ x 55 | tyle hassuf 4 JJ x 56 | NNS ates fhassuf 4 VBZ x 57 | fy hassuf 2 VB x 58 | NN ways fgoodleft JJ x 59 | NNP ies fhassuf 3 NNPS x 60 | VB negative fgoodright NN x 61 | ders hassuf 4 NNS x 62 | ds hassuf 2 NNS x 63 | nian hassuf 4 JJ x 64 | ppy hassuf 3 JJ x 65 | NN ish fhassuf 3 JJ x 66 | tors hassuf 4 NNS x 67 | oses hassuf 4 VBZ x 68 | NNS oves fhassuf 4 VBZ x 69 | VBN un fhaspref 2 JJ x 70 | lent hassuf 4 JJ x 71 | VB k fchar NN x 72 | VB r fhassuf 1 NN x 73 | NNS Engelken fgoodright VBZ x 74 | NN ient fhassuf 4 JJ x 75 | ED hassuf 2 VBD x 76 | VBG B fchar NNP x 77 | VB le fhassuf 2 NN x 78 | ING hassuf 3 NN x 79 | JJ ery fhassuf 3 NN x 80 | JJ tus fhassuf 3 NN x 81 | JJ car fhassuf 3 NN x 82 | NN 6 fchar CD x 83 | NNS 0 fchar CD x 84 | here hassuf 4 RB x 85 | VBN scr fhaspref 3 VBD x 86 | uces hassuf 4 VBZ x 87 | fies hassuf 4 VBZ x 88 | NNP $ fchar $ x 89 | VBN wa fhaspref 2 VBD x 90 | -------------------------------------------------------------------------------- /data/concise-stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | an 3 | the 4 | i 5 | you 6 | he 7 | she 8 | it 9 | we 10 | they 11 | be 12 | am 13 | is 14 
| are 15 | was 16 | were 17 | -------------------------------------------------------------------------------- /data/stem-dict.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslick/cl-langutils/38beec7a82eeb35b0bfb0824a41d13ed94fc648b/data/stem-dict.txt -------------------------------------------------------------------------------- /data/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | according 6 | accordingly 7 | across 8 | actually 9 | after 10 | afterwards 11 | again 12 | against 13 | all 14 | allow 15 | allowed 16 | allowing 17 | allows 18 | almost 19 | alone 20 | along 21 | already 22 | also 23 | although 24 | always 25 | am 26 | among 27 | amongst 28 | an 29 | and 30 | another 31 | anybody 32 | anyone 33 | anyways 34 | because 35 | beside 36 | both 37 | comes 38 | eg 39 | et 40 | etc 41 | for 42 | from 43 | her 44 | hers 45 | herself 46 | him 47 | himself 48 | howbeit 49 | ibid 50 | if 51 | in 52 | inasmuch 53 | into 54 | less 55 | it 56 | its 57 | itself 58 | ltd 59 | me 60 | more 61 | my 62 | myself 63 | no 64 | not 65 | noone 66 | nor 67 | of 68 | oh 69 | on 70 | onto 71 | or 72 | our 73 | ours 74 | ourselves 75 | out 76 | per 77 | she 78 | since 79 | thanx 80 | that 81 | thats 82 | the 83 | their 84 | theirs 85 | them 86 | themselves 87 | theres 88 | these 89 | they 90 | this 91 | those 92 | thru 93 | to 94 | toward 95 | towards 96 | unless 97 | until 98 | unto 99 | up 100 | upon 101 | ve 102 | via 103 | viz 104 | we 105 | what 106 | whenever 107 | where 108 | whereafter 109 | whereas 110 | wherever 111 | whether 112 | which 113 | whither 114 | who 115 | whoever 116 | whom 117 | whomever 118 | whose 119 | with 120 | without 121 | you 122 | your 123 | yours 124 | yourself 125 | yourselves 126 | a 127 | able 128 | about 129 | above 130 | according 131 | accordingly 132 | across 133 | actually 134 | 
after 135 | afterwards 136 | again 137 | against 138 | all 139 | allow 140 | allowed 141 | allowing 142 | allows 143 | almost 144 | alone 145 | along 146 | already 147 | also 148 | although 149 | always 150 | am 151 | among 152 | amongst 153 | an 154 | and 155 | another 156 | any 157 | anybody 158 | anyhow 159 | anyone 160 | anything 161 | anyway 162 | anyways 163 | anywhere 164 | apart 165 | appear 166 | appeared 167 | appearing 168 | appears 169 | appreciates 170 | appreciating 171 | appropriate 172 | are 173 | around 174 | as 175 | aside 176 | ask 177 | asked 178 | asks 179 | associated 180 | associating 181 | at 182 | available 183 | away 184 | awfully 185 | be 186 | became 187 | because 188 | become 189 | becomes 190 | becoming 191 | been 192 | before 193 | beforehand 194 | behind 195 | believe 196 | believed 197 | believes 198 | below 199 | beside 200 | besides 201 | best 202 | better 203 | between 204 | beyond 205 | both 206 | but 207 | by 208 | bye 209 | came 210 | can 211 | cannot 212 | cant 213 | certainly 214 | change 215 | changed 216 | changes 217 | changing 218 | clearly 219 | com 220 | come 221 | comes 222 | coming 223 | concerning 224 | consequently 225 | consider 226 | considered 227 | considering 228 | considers 229 | contain 230 | contained 231 | containing 232 | contains 233 | corresponding 234 | could 235 | currently 236 | de 237 | definitely 238 | describe 239 | described 240 | describes 241 | describing 242 | despite 243 | did 244 | didn 245 | different 246 | do 247 | does 248 | doing 249 | don 250 | done 251 | down 252 | downward 253 | downwards 254 | during 255 | each 256 | edu 257 | eg 258 | either 259 | else 260 | elsewhere 261 | enough 262 | entirely 263 | especially 264 | et 265 | etc 266 | even 267 | ever 268 | every 269 | everybody 270 | everyone 271 | everything 272 | everywhere 273 | ex 274 | exactly 275 | example 276 | except 277 | f 278 | far 279 | few 280 | finally 281 | follow 282 | followed 283 | following 284 | follows 285 | 
for 286 | former 287 | formerly 288 | forth 289 | from 290 | further 291 | furthermore 292 | g 293 | gave 294 | get 295 | gets 296 | getting 297 | give 298 | given 299 | gives 300 | giving 301 | go 302 | goes 303 | going 304 | gone 305 | good 306 | got 307 | gotten 308 | greetings 309 | h 310 | had 311 | half 312 | happen 313 | happened 314 | happening 315 | happens 316 | hardly 317 | has 318 | have 319 | having 320 | he 321 | hence 322 | her 323 | here 324 | hereafter 325 | hereby 326 | herein 327 | hereupon 328 | hers 329 | herself 330 | hi 331 | him 332 | himself 333 | his 334 | hither 335 | hopefully 336 | how 337 | howbeit 338 | however 339 | i 340 | ibid 341 | ie 342 | if 343 | ignore 344 | ignored 345 | ignores 346 | ignoring 347 | immediate 348 | in 349 | inasmuch 350 | indeed 351 | indicate 352 | indicated 353 | indicates 354 | indicating 355 | inner 356 | insofar 357 | instead 358 | into 359 | inward 360 | is 361 | it 362 | its 363 | itself 364 | j 365 | just 366 | k 367 | kay 368 | keep 369 | keeping 370 | keeps 371 | kept 372 | knew 373 | know 374 | knowing 375 | known 376 | knows 377 | l 378 | last 379 | lately 380 | later 381 | latter 382 | latterly 383 | least 384 | less 385 | lest 386 | lets 387 | letting 388 | like 389 | liked 390 | likely 391 | likes 392 | liking 393 | little 394 | ll 395 | look 396 | looked 397 | looking 398 | looks 399 | ltd 400 | m 401 | mainly 402 | many 403 | may 404 | maybe 405 | me 406 | means 407 | meanwhile 408 | merely 409 | might 410 | mine 411 | more 412 | moreover 413 | most 414 | mostly 415 | much 416 | must 417 | my 418 | myself 419 | n 420 | name 421 | namely 422 | nd 423 | near 424 | nearly 425 | necessary 426 | need 427 | needed 428 | needing 429 | needs 430 | neither 431 | never 432 | nevertheless 433 | next 434 | no 435 | nobody 436 | non 437 | none 438 | noone 439 | nor 440 | normally 441 | not 442 | nothing 443 | now 444 | nowhere 445 | o 446 | obviously 447 | of 448 | off 449 | often 450 | oh 451 | ok 452 | 
okay 453 | on 454 | once 455 | one 456 | ones 457 | only 458 | onto 459 | or 460 | other 461 | others 462 | otherwise 463 | ought 464 | our 465 | ours 466 | ourselves 467 | out 468 | outside 469 | over 470 | overall 471 | own 472 | owned 473 | owning 474 | owns 475 | p 476 | particular 477 | particularly 478 | per 479 | perhaps 480 | place 481 | placed 482 | places 483 | placing 484 | please 485 | possible 486 | presumably 487 | probably 488 | provide 489 | provided 490 | provides 491 | providing 492 | q 493 | que 494 | quite 495 | qv 496 | r 497 | rather 498 | rd 499 | re 500 | really 501 | reasonably 502 | regarding 503 | regardless 504 | regards 505 | relatively 506 | respectively 507 | s 508 | said 509 | same 510 | saw 511 | say 512 | saying 513 | says 514 | secondly 515 | see 516 | seeing 517 | seem 518 | seemed 519 | seeming 520 | seems 521 | seen 522 | sees 523 | self 524 | selves 525 | send 526 | sending 527 | sends 528 | sent 529 | seriously 530 | several 531 | shall 532 | she 533 | should 534 | since 535 | so 536 | some 537 | somebody 538 | somehow 539 | someone 540 | something 541 | sometime 542 | sometimes 543 | somewhat 544 | somewhere 545 | soon 546 | sorry 547 | specified 548 | specifies 549 | specify 550 | specifying 551 | still 552 | such 553 | sup 554 | sure 555 | t 556 | take 557 | taken 558 | takes 559 | taking 560 | tell 561 | telling 562 | tells 563 | tend 564 | tended 565 | tending 566 | tends 567 | th 568 | than 569 | thank 570 | thanked 571 | thanking 572 | thanks 573 | thanx 574 | that 575 | thats 576 | the 577 | their 578 | theirs 579 | them 580 | themselves 581 | then 582 | thence 583 | there 584 | thereafter 585 | thereby 586 | therefore 587 | therein 588 | theres 589 | thereupon 590 | these 591 | they 592 | think 593 | thinking 594 | thinks 595 | this 596 | thorough 597 | thoroughly 598 | those 599 | though 600 | thought 601 | through 602 | throughout 603 | thru 604 | thus 605 | to 606 | together 607 | told 608 | too 609 | took 610 | 
toward 611 | towards 612 | tried 613 | tries 614 | truly 615 | try 616 | trying 617 | twice 618 | u 619 | un 620 | under 621 | unfortunately 622 | unless 623 | unlikely 624 | until 625 | unto 626 | up 627 | upon 628 | us 629 | use 630 | used 631 | useful 632 | uses 633 | using 634 | usually 635 | v 636 | various 637 | ve 638 | very 639 | via 640 | viz 641 | w 642 | want 643 | wanted 644 | wanting 645 | wants 646 | was 647 | way 648 | we 649 | welcome 650 | well 651 | went 652 | were 653 | what 654 | whatever 655 | when 656 | whence 657 | whenever 658 | where 659 | whereafter 660 | whereas 661 | whereby 662 | wherein 663 | whereupon 664 | wherever 665 | whether 666 | which 667 | while 668 | whither 669 | who 670 | whoever 671 | whole 672 | whom 673 | whomever 674 | whose 675 | why 676 | will 677 | willing 678 | wished 679 | with 680 | within 681 | without 682 | wonder 683 | wondered 684 | wondering 685 | would 686 | x 687 | y 688 | yeah 689 | yes 690 | yet 691 | you 692 | your 693 | yours 694 | yourself 695 | yourselves 696 | z 697 | 698 | -------------------------------------------------------------------------------- /docs/LISP2005-langutils.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslick/cl-langutils/38beec7a82eeb35b0bfb0824a41d13ed94fc648b/docs/LISP2005-langutils.pdf -------------------------------------------------------------------------------- /langutils.asd: -------------------------------------------------------------------------------- 1 | ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- 2 | 3 | ;; Language utilities asd files 4 | 5 | (defpackage #:langutils.system 6 | (:use #:cl #:asdf)) 7 | 8 | (in-package #:langutils.system) 9 | 10 | (defsystem #:langutils 11 | :description "Language utilities" 12 | :version "1.0" 13 | :author "Ian Eslick" 14 | :licence "BSD" 15 | :depends-on (:s-xml-rpc :stdutils) 16 | :components ((:module "src" 17 | :components 
((:file "package") 18 | (:file "config") 19 | (:file "tokens") 20 | (:file "reference") 21 | (:file "stopwords") 22 | (:file "my-meta") 23 | (:file "tokenize") 24 | (:file "lexicon") 25 | (:file "lemma") 26 | (:file "porter") 27 | (:file "contextual-rule-parser") 28 | (:file "tagger-data") 29 | (:file "tagger") 30 | (:file "chunker-constants") 31 | (:file "chunker") 32 | (:file "concept") 33 | (:file "init")) 34 | :serial t)) 35 | :in-order-to ((load-op (compile-op :langutils)))) 36 | 37 | 38 | -------------------------------------------------------------------------------- /src/chunker-constants.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: chunker-constants 6 | ;;;; Purpose: Constant definitions that need to have compile time values 7 | ;;;; prior to macro use in chunker.lisp 8 | ;;;; 9 | ;;;; Programmer: Ian S. Eslick 10 | ;;;; Date Started: November 2004 11 | ;;;; 12 | 13 | (in-package :langutils) 14 | 15 | (defconstant noun-pattern 16 | (if (boundp 'noun-pattern) 17 | (symbol-value 'noun-pattern) 18 | '(OR 19 | (AND 20 | (? OR PDT) 21 | (OR DT PRP PRP$ WDT WP WP$) 22 | (* OR VBG VBD VBN JJ JJR JJS \,) ;; CC) 23 | ;; (OR 24 | (+ OR NN NNS NNP NNPS CD)) 25 | ;; (AND (+ OR NN NNS NNP NNPS CD) 26 | ;; (* OR VBG VBD VBN JJ JJR JJS #\, CC NN NNS NNP NNPS CD) 27 | ;; (+ OR NN NNS NNP NNPS CD)))) 28 | (AND 29 | (? OR PDT) 30 | ;; (* OR JJ JJR JJS #\, CC NN NNS NNP NNPS CD) 31 | ;; (last-1 OR NN NNS NNP NNPS CD)))) 32 | (* OR JJ JJR JJS \,) ;; CC) 33 | (+ OR NN NNS NNP NNPS CD)) 34 | EX PRP WP WDT))) 35 | 36 | (defconstant verb-pattern 37 | (if (boundp 'verb-pattern) 38 | (symbol-value 'verb-pattern) 39 | '(and 40 | (* or RB RBR RBS WRB) 41 | (? 
or MD) 42 | (* or RB RBR RBS WRB) 43 | (+ or VB VBD VBG VBP VBZ) 44 | (* or VB VBD VBG VBN VBP VBZ RB RBR RBS WRB) 45 | (? or RP)))) 46 | ;; (? and (* or RB) (or VB VBN) (? or RP)))) 47 | 48 | (defconstant adv-pattern 49 | ;; '(or 50 | ;; (and 51 | ;; (* or RB RBR RBS) 52 | ;; (+ or JJ JJR JJS) 53 | ;; (* or RB RBR RBS JJ JJR JJS) 54 | ;;; (+ or JJ JJR JJS)) 55 | (if (boundp 'adv-pattern) 56 | (symbol-value 'adv-pattern) 57 | '(and 58 | (* or RB RBR RBS) 59 | (+ or JJ JJR JJS)))) 60 | 61 | (defconstant p-pattern 62 | (if (boundp 'p-pattern) 63 | (symbol-value 'p-pattern) 64 | '(and (+ or IN)))) 65 | -------------------------------------------------------------------------------- /src/chunker.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: chunker 6 | ;;;; Purpose: A regex verb and noun phrase chunker using the 7 | ;;;; array matching utility infrastructure 8 | ;;;; 9 | ;;;; Programmer: Ian S. 
Eslick 10 | ;;;; Date Started: October 2004 11 | ;;;; 12 | 13 | (in-package :langutils) 14 | 15 | ;; =============================== 16 | ;; STRING CHUNKING TOP LEVEL 17 | ;; Speed: slow (extra conversion) 18 | ;; Input size: < 100k strings 19 | 20 | (defun chunk (text) 21 | "Returns a phrase-list for the provided text" 22 | (get-basic-chunks (vector-tag text))) 23 | 24 | (defun chunk-tokenized (text) 25 | "Returns a phrase-list for the provided tokenized string" 26 | (get-basic-chunks (vector-tag-tokenized text))) 27 | 28 | ;; =============================== 29 | ;; VECTOR-DOCUMENT INTERFACE 30 | ;; Speed: optimal 31 | ;; Input size: unlimited 32 | 33 | (defmethod get-basic-chunks ((doc vector-document) &optional interval) 34 | "Returns a list of PHRASEs referencing 'doc' for 35 | all supported primitive phrase types" 36 | (let ((nxs (get-nx-chunks doc interval)) 37 | (vxs (get-vx-chunks doc interval)) 38 | (axs (get-adverb-chunks doc interval)) 39 | (ps (get-pp-chunks doc interval))) 40 | (sort (append nxs vxs axs ps) 41 | #'< :key #'phrase-start))) 42 | 43 | (defmethod get-imperative-chunks ((doc vector-document) &optional interval) 44 | (do-collect-vector-matches (start end #.(localize-expression 45 | (list :AND 46 | verb-pattern 47 | (list :? 
:AND noun-pattern) 48 | p-pattern 49 | noun-pattern) :package 'keyword)) 50 | ((if interval 51 | (subseq (document-tags doc) (car interval) (cdr interval)) 52 | (document-tags doc))) 53 | (make-instance 'phrase 54 | :type :imperative 55 | :document doc 56 | :start (if interval (+ start (car interval)) start) 57 | :end (if interval (+ end (car interval)) end)))) 58 | 59 | (defmethod get-event-chunks ((doc vector-document) &optional interval) 60 | "Return vx+nx (simple verb arg) phrase objects" 61 | (do-collect-vector-matches (start end #.(localize-expression (list :AND verb-pattern noun-pattern) :package 'keyword)) 62 | ((if interval 63 | (subseq (document-tags doc) (car interval) (cdr interval)) 64 | (document-tags doc))) 65 | (write-log chunker "Found event: ~A (~A : ~A)" (subseq (document-tags doc) start (1+ end)) start end) 66 | (make-instance 'phrase 67 | :type :event 68 | :document doc 69 | :start (if interval (+ start (car interval)) start) 70 | :end (if interval (+ end (car interval)) end)))) 71 | 72 | (defmethod get-extended-event-chunks1 ((doc vector-document) &optional interval) 73 | "Return vx+nx+pp... objects" 74 | (do-collect-vector-matches (start end #.(localize-expression `(and ,verb-pattern ,noun-pattern ,p-pattern ,noun-pattern) :package 'keyword)) 75 | ((if interval 76 | (subseq (document-tags doc) (car interval) (cdr interval)) 77 | (document-tags doc))) 78 | (write-log chunker "Found extended event: ~A (~A : ~A)" (subseq (document-tags doc) start (1+ end)) start end) 79 | (make-instance 'phrase 80 | :type :event 81 | :document doc 82 | :start (if interval (+ start (car interval)) start) 83 | :end (if interval (+ end (car interval)) end)))) 84 | 85 | (defmethod get-extended-event-chunks2 ((doc vector-document) &optional interval) 86 | "Return vx+nx+pp... 
objects" 87 | (do-collect-vector-matches (start end #.(localize-expression `(and ,verb-pattern ,noun-pattern ,p-pattern ,noun-pattern ,p-pattern ,noun-pattern) :package 'keyword)) 88 | ((if interval 89 | (subseq (document-tags doc) (car interval) (cdr interval)) 90 | (document-tags doc))) 91 | (write-log chunker "Found extended event: ~A (~A : ~A)" (subseq (document-tags doc) start (1+ end)) start end) 92 | (make-instance 'phrase 93 | :type :event 94 | :document doc 95 | :start (if interval (+ start (car interval)) start) 96 | :end (if interval (+ end (car interval)) end)))) 97 | 98 | 99 | (defmethod get-nx-chunks ((doc vector-document) &optional interval) 100 | "Return a list of all nx phrases" 101 | (do-collect-vector-matches (start end #.(localize-expression noun-pattern :package 'keyword)) 102 | ((if interval 103 | (subseq (document-tags doc) (car interval) (cdr interval)) 104 | (document-tags doc))) 105 | (write-log chunker "Found np: ~A (~A : ~A)" (subseq (document-tags doc) start (1+ end)) start end) 106 | (make-instance 'phrase 107 | :type :nx 108 | :document doc 109 | :start (if interval (+ start (car interval)) start) 110 | :end (if interval (+ end (car interval)) end)))) 111 | 112 | (defmethod get-vx-chunks ((doc vector-document) &optional interval) 113 | "Return a list of all primitive vx phrases - no arguments" 114 | (do-collect-vector-matches (start end #.(localize-expression verb-pattern :package 'keyword)) 115 | ((if interval 116 | (subseq (document-tags doc) (car interval) (cdr interval)) 117 | (document-tags doc))) 118 | (write-log chunker "Found vp: ~A (~A : ~A)" (subseq (document-tags doc) start (1+ end)) start end) 119 | (make-instance 'phrase 120 | :type :vx 121 | :document doc 122 | :start (if interval (+ start (car interval)) start) 123 | :end (if interval (+ end (car interval)) end)))) 124 | 125 | (defmethod get-adverb-chunks ((doc vector-document) &optional interval) 126 | "Return a list of all adverbial phrases" 127 | 
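For orientation, here is a minimal hedged sketch of calling one of these extractors directly (assuming an initialized langutils image; `vector-tag` is the same tagger entry point used by `chunk` at the top of this file):

```lisp
;; Sketch only.  adv-pattern is (and (* or RB RBR RBS) (+ or JJ JJR JJS)):
;; zero or more adverbs followed by one or more adjectives.
(let ((doc (vector-tag "The weather was very nice .")))
  ;; "very nice" tags as RB JJ, so it should surface as an :advp PHRASE
  ;; whose start/end indices point into the document's tag vector.
  (get-adverb-chunks doc))
```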
(do-collect-vector-matches (start end #.(localize-expression adv-pattern :package 'keyword)) 128 | ((if interval 129 | (subseq (document-tags doc) (car interval) (cdr interval)) 130 | (document-tags doc))) 131 | (write-log chunker "Found ap: ~A (~A : ~A)" (subseq (document-tags doc) start (1+ end)) start end) 132 | (make-instance 'phrase 133 | :type :advp 134 | :document doc 135 | :start (if interval (+ start (car interval)) start) 136 | :end (if interval (+ end (car interval)) end)))) 137 | 138 | (defmethod get-p-chunks ((doc vector-document) &optional interval) 139 | "Return a list of all prepositions as phrases" 140 | (do-collect-vector-matches (start end #.(localize-expression p-pattern :package 'keyword)) 141 | ((if interval 142 | (subseq (document-tags doc) (car interval) (cdr interval)) 143 | (document-tags doc))) 144 | (write-log chunker "Found prep: ~A (~A : ~A)" (subseq (document-tags doc) start (1+ end)) start end) 145 | (make-instance 'phrase 146 | :type :prep 147 | :document doc 148 | :start (if interval (+ start (car interval)) start) 149 | :end (if interval (+ end (car interval)) end)))) 150 | 151 | (defmethod get-pp-chunks ((doc vector-document) &optional interval) 152 | "Return a list of all prepositional phrases (a preposition followed by a noun phrase)" 153 | (do-collect-vector-matches (start end #.(localize-expression (list :AND p-pattern noun-pattern) :package 'keyword)) 154 | ((if interval 155 | (subseq (document-tags doc) (car interval) (cdr interval)) 156 | (document-tags doc))) 157 | (write-log chunker "Found pp: ~A (~A : ~A)" (subseq (document-tags doc) start (1+ end)) start end) 158 | (make-instance 'phrase 159 | :type :pp 160 | :document doc 161 | :start (if interval (+ start (car interval)) start) 162 | :end (if interval (+ end (car interval)) end)))) 163 | 164 | (defun head-verbs (phrases &key (filter-common t)) 165 | (collect (lambda (p) (head-verb p :filter-common filter-common)) 166 | phrases)) 167 | 168 | (defparameter *common-verbs* nil) 169 | 170 | (defun 
ensure-common-verbs () 171 | (setf *common-verbs* 172 | (mapcar #'id-for-token 173 | '("be" "have" "say" "see" "ask" "tell" "reply" 174 | "do" "let" "find" "answer" "take")))) 175 | 176 | (defun head-verb (phrase &key (filter-common t)) 177 | (ensure-common-verbs) 178 | (unless (and filter-common 179 | (member (get-lemma-for-id (get-token-id phrase 0)) *common-verbs*)) 180 | (make-phrase (make-array 1 :initial-element (get-token-id phrase 0)) 181 | (make-array 1 :initial-element (get-tag phrase 0)) 182 | :verb))) 183 | 184 | (defun root-nouns (phrases) 185 | (append (collect #'root-noun phrases) 186 | phrases)) 187 | 188 | (defun root-noun (phrase) 189 | (when (> (phrase-length phrase) 1) 190 | (let ((last (1- (phrase-length phrase)))) 191 | (make-phrase (make-array 1 :initial-element (get-token-id phrase last)) 192 | (make-array 1 :initial-element (get-tag phrase last)) 193 | :noun)))) 194 | 195 | 196 | ;; ==================================== 197 | ;; Simple tagging interactive function 198 | 199 | (defun test-phrase (text) 200 | "Prints all the phrases found in the text for simple 201 | experimenting" 202 | (let ((doc (vector-tag text))) 203 | (format t "Tagged: ~A~%" (print-vector-document doc)) 204 | (mapcar #'print-phrase (get-basic-chunks doc)))) 205 | 206 | ;; ======================================= 207 | ;; Experiment with higher order structure 208 | 209 | (defun all-vx+nx-phrases (phrases) 210 | "Overly hairy function for finding all vx phrases that 211 | are followed by nx. Get event chunks is a better way 212 | to do this." 
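As the docstring above suggests, `get-event-chunks` is the simpler route to verb+argument pairs. A hedged usage sketch (assuming an initialized langutils image):

```lisp
;; Sketch only: vx+nx "event" extraction over a tagged document.
(let ((doc (vector-tag "The dog chased the cat .")))
  ;; Matches a verb group immediately followed by a noun group and
  ;; returns PHRASE objects of :type :event referencing doc.
  (get-event-chunks doc))
```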
213 | (declare (optimize speed (safety 1)) 214 | (type list phrases)) 215 | (let ((pairs nil)) 216 | (declare (type list pairs)) 217 | (labels ((following-noun (start phrases count) 218 | (cond ((or (null phrases) (= count 2)) 219 | nil) 220 | ((= start (phrase-start (car phrases))) 221 | (car phrases)) 222 | (t (following-noun start (cdr phrases) (1+ count))))) 223 | (rec (cp phrases) 224 | (cond ((null phrases) 225 | (nreverse pairs)) 226 | ((eq (phrase-type cp) :verb) 227 | (aif (following-noun (1+ (phrase-end cp)) phrases 0) 228 | (push (make-instance 'phrase 229 | :type :event 230 | :document (phrase-document cp) 231 | :start (phrase-start cp) 232 | :end (phrase-end it)) 233 | pairs)) 234 | (rec (car phrases) (cdr phrases))) 235 | (t (rec (car phrases) (cdr phrases)))))) 236 | (rec (car phrases) (cdr phrases))))) 237 | 238 | 239 | -------------------------------------------------------------------------------- /src/concept.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: concept 6 | ;;;; Purpose: Abstraction for concepts. A "concept" contains an array of 7 | ;;;; tokens representing the words of that concept, ie "walk dog" or "brush teeth" 8 | ;;;; 9 | ;;;; Programmer: Aaron M. 
Sokoloski (and Ian Eslick) 10 | ;;;; Date Started: January 2005 11 | 12 | 13 | (in-package :langutils) 14 | 15 | ;; 16 | ;; Concept class definition 17 | ;; 18 | 19 | (defclass concept () 20 | ((token-vector ;; vector of token ids for words 21 | :reader token-vector 22 | :initarg :token-vector 23 | #-mcl :type #-mcl (array fixnum) 24 | :documentation "Stores the representation of the concept as an array of token ids"))) 25 | 26 | (eval-when (:compile-toplevel :load-toplevel) 27 | (export 'token-vector)) 28 | 29 | (defmethod print-object ((cn concept) stream) 30 | (print-unreadable-object (cn stream :type t :identity nil) 31 | (prin1 (concept->string cn) stream))) 32 | 33 | ;; 34 | ;; Concept library so all instances of the same 'concept' 35 | ;; are the same object (cheap 'eq comparisons) 36 | ;; 37 | 38 | (defvar *concept-vhash* nil) 39 | 40 | (defmethod lookup-canonical-concept-instance ((ta array)) 41 | "Take a token array and see if there is already a 42 | concept instance" 43 | ;; (vechash-get ta *concept-vhash*)) 44 | (stdutils.gds:get-value *concept-vhash* ta)) 45 | 46 | (defmethod clear-concept-cache () 47 | (setf *concept-vhash* (make-instance 'stdutils.gds:vector-keyed-table))) 48 | 49 | (defparameter *concept-store-scratch-array* (make-array 20 :element-type 'fixnum :adjustable t) 50 | "Allows us to lookup concepts from arrays without allocating lots of unnecessary data") 51 | 52 | (defmethod lookup-canonical-concept-instance ((lf list)) 53 | "List of fixnums to lookup a concept instance" 54 | ;; (vechash-get lf *concept-vhash*)) 55 | (stdutils.gds:get-value *concept-vhash* (list->array lf))) 56 | 57 | (defmethod register-new-concept-instance ((c concept)) 58 | ;; (vechash-put (token-vector c) c *concept-vhash*) 59 | (setf (stdutils.gds:get-value *concept-vhash* (token-vector c)) c) 60 | c) 61 | 62 | ;; 63 | ;; Concept is basically a low-overhead phrase, so allow comparisons between them 64 | ;; 65 | 66 | (defmethod-exported conceptually-equal ((ph1 
phrase) (ph2 phrase)) 67 | (every #'eql (phrase-lemmas ph1) (phrase-lemmas ph2))) 68 | 69 | (defmethod-exported conceptually-equal ((ph phrase) (cn concept)) 70 | (every #'eql (phrase-lemmas ph) (concept->token-array cn))) 71 | 72 | (defmethod-exported conceptually-equal ((cn concept) (ph phrase)) 73 | (conceptually-equal ph cn)) 74 | 75 | (defmethod-exported conceptually-equal ((cn1 concept) (cn2 concept)) 76 | (let ((ta1 (concept->token-array cn1)) 77 | (ta2 (concept->token-array cn2))) 78 | (and (eql (length ta1) (length ta2)) 79 | (every #'eql ta1 ta2)))) 80 | 81 | (defmethod-exported concept-contains ((csuper concept) (csub concept)) 82 | (search (concept->token-array csub) (concept->token-array csuper) :test #'eql)) 83 | 84 | (defmethod-exported concat-concepts (&rest concepts) 85 | (token-array->concept (apply #'concatenate 'vector (mapcar #'concept->token-array concepts)) :lemmatized t)) 86 | 87 | ;; 88 | ;; Getting at text of Concepts 89 | ;; 90 | 91 | (defmethod-exported concept->words ((cname concept)) 92 | (loop for id across (token-vector cname) 93 | collecting 94 | (token-for-id id) 95 | into strings 96 | finally (return strings))) 97 | 98 | (defmethod-exported concept->string ((cname concept)) 99 | (list-to-delimited-string (concept->words cname))) 100 | 101 | (defmethod-exported concept->token-array ((cname concept)) 102 | "Concepts are immutable, don't change them!" 
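Because concepts are interned through `*concept-vhash*`, structurally equal word sequences share a single instance and can be compared with `eq`. A small sketch, mirroring the unit test at the bottom of this file:

```lisp
;; Sketch only: interning makes EQ a valid concept comparison.
;; Lemmatization reduces "dogs" to "dog" and strips the determiner,
;; so both strings map to the same canonical CONCEPT instance.
(eq (string->concept "a fast dog")
    (string->concept "fast dogs"))  ; the file's own test asserts this is true
```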
103 | (let ((copy (make-array (length (token-vector cname)) :element-type 'fixnum))) 104 | (map-into copy #'identity (token-vector cname)))) 105 | 106 | 107 | ;; 108 | ;; Concept creation and uniqification (concepts are non-volatile) 109 | ;; 110 | 111 | (defun-exported force-concept (c) 112 | (etypecase c 113 | (string (string->concept c)) 114 | (concept c) 115 | (array (token-array->concept c)) 116 | (phrase (phrase->concept c)))) 117 | 118 | (defun-exported string->concept (s &key (lemmatized nil)) 119 | (words->concept (lex-string (string-downcase s)) :lemmatized lemmatized)) 120 | 121 | (defun-exported phrase->concept (p &key lemmatized) 122 | "Create a canonical concept from an arbitrary phrase 123 | by removing determiners and lemmatizing verbs." 124 | ;; NOTE: Could do this faster, but what the heck... 125 | (words->concept (phrase-words p) :lemmatized lemmatized)) 126 | 127 | (defun-exported words->concept (slist &key (lemmatized nil)) 128 | (labels ((ensure-tokens (list) 129 | (if (not (integerp (car list))) 130 | (mapcar #'id-for-token list) 131 | list)) 132 | (ensure-lemmatized (list) 133 | (if (not lemmatized) 134 | (lemmatize list :strip-det t :noun t :last-only t) 135 | list))) 136 | (let ((tlist (ensure-lemmatized (ensure-tokens slist)))) 137 | (when tlist 138 | (aif-ret (lookup-canonical-concept-instance tlist) 139 | (let ((token-vector (make-array (length tlist) :element-type 'fixnum :adjustable nil))) 140 | (map-into token-vector #'identity tlist) 141 | (token-array->concept token-vector :lemmatized t))))))) 142 | 143 | (defun-exported token-array->concept (tokens &key (lemmatized nil)) 144 | (if lemmatized 145 | (ensure-concept tokens) 146 | (ensure-concept (lemmatize tokens :strip-det t :noun t :last-only t)))) 147 | 148 | (defun ensure-concept (tokens) 149 | (aif-ret (lookup-canonical-concept-instance tokens) 150 | (make-concept tokens))) 151 | 152 | (defun-exported make-concept (ta) 153 | (register-new-concept-instance (make-instance 'concept 
:token-vector ta))) 154 | 155 | ;; 156 | ;; Random concept related utilities 157 | ;; 158 | 159 | (defun-exported associate-concepts (phrases) 160 | "Return the list of phrase/list/token-arrays as pairs with the 161 | first element being the original and the second being a 162 | canonicalized concept instance" 163 | (mapcar (lambda (phrase) 164 | (cons phrase (phrase->concept phrase))) 165 | phrases)) 166 | 167 | ;; 168 | ;; Some unit tests 169 | ;; 170 | 171 | (defun test-concept-equality () 172 | (assert (eq (string->concept "a fast dog") (string->concept "fast dogs")))) 173 | -------------------------------------------------------------------------------- /src/config.lisp: -------------------------------------------------------------------------------- 1 | (in-package :langutils) 2 | 3 | (defvar *default-lexicon-file* nil 4 | "Path to the lexicon file") 5 | (defvar *default-stems-file* nil 6 | "Path to the word stems file") 7 | (defvar *default-stopwords-file* nil 8 | "Path to a stopwords file") 9 | (defvar *default-concise-stopwords-file* nil 10 | "Path to a *very* small list of words. 
Mainly pronouns and determiners") 11 | (defvar *default-lexical-rule-file* nil 12 | "Path to the brill lexical rule file") 13 | (defvar *default-contextual-rule-file* nil 14 | "Path to the brill contextual rule file") 15 | (defvar *default-token-map-file* nil 16 | "Path to the token map file") 17 | (defvar *auto-init* nil 18 | "Whether to call initialize-langutils when the .fasl is loaded") 19 | (defvar *report-status* nil 20 | "Where to print langutils messages; default to none") 21 | 22 | 23 | 24 | (defparameter *config-paths* 25 | '((:lexicon *default-lexicon-file*) 26 | (:stems *default-stems-file*) 27 | (:stopwords *default-stopwords-file*) 28 | (:concise-stopwords *default-concise-stopwords-file*) 29 | (:lexical-rules *default-lexical-rule-file*) 30 | (:contextual-rules *default-contextual-rule-file*) 31 | (:token-map *default-token-map-file*))) 32 | 33 | (defmacro write-log (name msg &rest args) 34 | (declare (ignore name)) 35 | `(format *report-status* ,msg ,@args)) 36 | 37 | (defun relative-pathname (path) 38 | (when path 39 | (asdf:system-relative-pathname :langutils path))) 40 | 41 | (defun read-config () 42 | (with-open-file (file (relative-pathname "config.sexp")) 43 | (mapc #'handle-config-entry (read file)))) 44 | 45 | (defun handle-config-entry (entry) 46 | (destructuring-bind (option pathtype &optional path) entry 47 | (awhen (assoc option *config-paths*) 48 | (setf (symbol-value (second it)) 49 | (if (eq pathtype :relative) 50 | (relative-pathname path) 51 | path))) 52 | (case option 53 | (:auto-init (setf *auto-init* pathtype)) 54 | (:report-status (setf *report-status* pathtype))))) 55 | 56 | (eval-when (:load-toplevel) 57 | ;; Read config 58 | (read-config)) 59 | 60 | -------------------------------------------------------------------------------- /src/contextual-rule-parser.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | 
;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: contextual-rule-parser 6 | ;;;; Purpose: Macro for generating the brill rule parser, used only in tagger-data.lisp 7 | ;;;; 8 | ;;;; Programmer: Ian S. Eslick 9 | ;;;; Date Started: October 2004 10 | ;;;; 11 | 12 | (in-package :langutils) 13 | 14 | ;;; 15 | ;;; Macro system to build parser/generator (make-contextual-rule) 16 | ;;; 17 | 18 | (defmacro def-contextual-rule-parser (name &body template-list) 19 | "Given a list of structures, defines a generator named 'name' that takes 20 | a Brill contextual rule list (list of strings) and generates an applicable 21 | closure. The closure accepts an argument list of (tokens tags offset) and will 22 | apply the rule and related side effect to the two arrays at the provided 23 | offset. Patterns are to be given in the form: 24 | (\"SURROUNDTAG\" (match (0 oldtag) (-1 tag1) (+1 tag2)) => 25 | (setf oldtag newtag))" 26 | `(defun ,name (pattern) 27 | (declare (optimize (speed 3) (safety 0) (debug 0))) 28 | ;; (inline svref aref)) 29 | (let ((name (string-upcase (third pattern)))) 30 | (cond ,@(mapcar #'gen-rule-closure template-list) 31 | (t (write-log tagger-contextual "Unrecognized rule: ~A" (first pattern))))))) 32 | 33 | (defun gen-rule-closure (template) 34 | "Generate the code for the rule closure as one of the cond 35 | forms matching the name of the closure pattern to the 36 | rule pattern" 37 | (let ((rule-name (first template)) 38 | (match-pattern (second template)) 39 | (newtok-name (fourth template))) 40 | `((string= name ,rule-name) 41 | (let (,@(gen-rule-arg-bindings match-pattern) 42 | ,(get-bind-entry newtok-name)) 43 | ,(gen-rule-arg-decls template) 44 | (lambda (tokens tags pos) 45 | ,(gen-rule-closure-decl) 46 | (if ,(gen-rule-match template) 47 | (progn 48 | ;; (write-log tagger-contextual ,(format nil "~A: ~~A @ ~~A" (first template)) pattern pos) 49 | (setf (svref tags pos) 
,newtok-name)))))))) 50 | 51 | (defun gen-rule-closure-decl () 52 | "Optimize the compiled closure through 53 | type and optimization declarations" 54 | '(declare (ignorable tokens) 55 | (type (simple-array fixnum (*)) tokens) 56 | (type (simple-array symbol (*)) tags) 57 | (type fixnum pos) 58 | (optimize (speed 3) (safety 0) (debug 0) (space 0)))) 59 | 60 | (defparameter *contextual-rule-args* 61 | (list (list 'tag1 'tags 'symbol '(mkkeysym (fourth pattern))) 62 | (list 'tag2 'tags 'symbol '(mkkeysym (fifth pattern))) 63 | (list 'word1 'tokens 'fixnum '(id-for-token (fourth pattern))) 64 | (list 'word2 'tokens 'fixnum '(id-for-token (fifth pattern))) 65 | (list 'oldtag 'tags 'symbol '(mkkeysym (first pattern))) 66 | (list 'newtag 'tags 'symbol '(mkkeysym (second pattern)))) 67 | "The templates for parsing contextual rules and constructing 68 | matching templates over word/pos arrays") 69 | 70 | (defun get-bind-entry (var) 71 | "Given a canonical variable name, create its let binding 72 | and extraction expression from the rule file entry" 73 | (aif (find var *contextual-rule-args* :key #'car) 74 | (list (first it) (fourth it)) 75 | (error "Invalid variable in template pattern: ~A" var))) 76 | 77 | (defun gen-rule-arg-bindings (pattern) 78 | "Generate let bindings for the args referenced in the match pattern" 79 | (labels ((get-arg (match-rec) 80 | (when (consp match-rec) 81 | (second match-rec)))) 82 | (loop for exp in pattern nconcing 83 | (cond ((atom exp) nil) 84 | ((not (consp exp)) 85 | (format t "Parser error in macro, '(+1 tag1) form expected got ~A" exp)) 86 | ((eq (car exp) 'or) 87 | (gen-rule-arg-bindings (list (cadr exp)))) 88 | (t (list 89 | (get-bind-entry 90 | (get-arg exp)))))))) 91 | 92 | (defun gen-rule-arg-decls (pattern) 93 | "Generate type declarations for canonical variables from table entry" 94 | (labels ((get-arg-type (var) 95 | (awhen (find var *contextual-rule-args* :key #'car) 96 | (third it))) 97 | (make-arg-decl (match-rec) 98 | (when 
(consp match-rec) 99 | `(type ,(get-arg-type (second match-rec)) ,(second match-rec)))) 100 | (find-args (exps) 101 | (loop for exp in exps nconcing 102 | (cond ((atom exp) nil) 103 | ((not (consp exp)) 104 | (format t "Parser error in macro, '(-1 tag1) form expected got: ~A" exp)) 105 | ((eq (car exp) 'or) 106 | (find-args (cdr exp))) 107 | (t (list 108 | (make-arg-decl exp))))))) 109 | `(declare ,@(find-args (second pattern))))) 110 | 111 | (defun gen-rule-match (pattern) 112 | "Generate the conditional code to match this rule" 113 | (labels ((get-array-name (var) (second (find var *contextual-rule-args* :key #'car))) 114 | (get-pos-stmt (offset) (case offset 115 | (0 'pos) 116 | (+1 '(+ pos 1)) 117 | (+2 '(+ pos 2)) 118 | (+3 '(+ pos 3)) 119 | (-1 '(- pos 1)) 120 | (-2 '(- pos 2)) 121 | (-3 '(- pos 3)))) 122 | (parse-match-stmt (stmt) 123 | (cond ((eq (first stmt) 'or) 124 | `(or ,@(mapcar #'parse-match-stmt (cdr stmt)))) 125 | (t (let ((offset (first stmt)) 126 | (var (second stmt))) 127 | `(eq ,(case (get-array-name var) 128 | (tags `(aref tags ,(get-pos-stmt offset))) 129 | (tokens `(aref tokens ,(get-pos-stmt offset)))) 130 | ,var)))))) 131 | `(and ,@(mapcar #'parse-match-stmt (cdr (second pattern)))))) 132 | -------------------------------------------------------------------------------- /src/example.lisp: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Silly langutils example file 3 | ;; 4 | ;; Use: evaluate this buffer in Emacs or (load "example.lisp") from the REPL 5 | 6 | (in-package :langutils) 7 | 8 | (defparameter test-sentence1 "This is a test of the running system. End of sentences. ") 9 | (defparameter test-sentence2 "Ain't isn't a word.") 10 | (defparameter test-sentence3 "The acid rain in Spain falls mightily upon the plain. Or does it?") 11 | 12 | ;; Note that the last period in any string is not separated from the end word. This is 13 | ;; a known bug. 
14 | 15 | (eval-when (eval load) 16 | (format t "You can tokenize sentences like: ~%\"~A\"~% as~%\"~A\"~%~%" 17 | test-sentence1 18 | (tag test-sentence1)) 19 | (format t "Tokenization does a few odd things: ~A -> ~A~%~%" 20 | test-sentence2 (mvretn 3 (tokenize-string test-sentence2))) 21 | (format t "You can get the root forms of a word: ~A,~A -> ~A,~A~%~%" 22 | "testing" "tested" (get-lemma "testing") (get-lemma "tested")) 23 | (format t "Let's find some phrases within a sentence like this:~%\"~A\"~%->~%\"~A\"~%" 24 | test-sentence3 (tag test-sentence3)) 25 | (format t "Phrases (nps and vps): ~A~%" 26 | (chunk test-sentence3)) 27 | (format t "Get surface forms of a root (morph-surface-forms \"run\"):~%~A~%" 28 | (morph-surface-forms "run")) 29 | (format t "Oops! All words are represented as token-id's (mapcar #'token-for-id ...) :~%~A~%~%" 30 | (mapcar #'token-for-id (morph-surface-forms "run"))) 31 | (pprint "If one of the above statements doesn't do something obvious, there's a bug!")) -------------------------------------------------------------------------------- /src/init.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: init 6 | ;;;; Purpose: Initialization of the language utilities package 7 | ;;;; 8 | ;;;; Programmer: Ian S.
Eslick 9 | ;;;; Date Started: October 2004 10 | ;;;; 11 | ;;;; Notes: Implements the subsystem convention: - 12 | ;;;; init-langutils - initialize the system, called from asdf-config startup script 13 | ;;;; clean-langutils - Wipe all static datafiles, set system to its clean state 14 | ;;;; reset-langutils - Call clean & init to reset system to initial loaded state 15 | ;;;; 16 | 17 | (in-package :langutils) 18 | 19 | ;; NOTES ON USAGE: 20 | ;; 21 | ;; When the system is initialized, the lexicon should be loaded first, followed by any 22 | ;; lemma files. That way any token IDs used in the stored SQL arrays stay coherent 23 | ;; to the database. To ensure a standard token map, you can set the variable 24 | ;; *default-token-map-file* to define a file to load from. You must explicitly save 25 | ;; your own token map at some point. 26 | ;; 27 | 28 | (defun init-langutils () 29 | (when (if *external-token-map* 30 | (and (not *lexicon*) *token-counter-hook*) 31 | (not *lexicon*)) 32 | (format t "Initializing langutils...~%") 33 | 34 | ;; Token maps 35 | (initialize-tokens) 36 | (unless *external-token-map* 37 | (reset-token-counts) 38 | (aif *default-token-map-file* (load-token-map it))) 39 | 40 | ;; Lexicon (virtualize init?)
41 | (unless *lexicon* 42 | (format t "Loading lexicon...~%") 43 | (time 44 | (init-lexicon *default-lexicon-file* *default-stems-file*))) 45 | 46 | ;; Tagger 47 | (unless (and *tagger-lexical-rules* *tagger-contextual-rules*) 48 | (format t "Loading tagger rule sets...~%") 49 | (init-tagger *default-lexical-rule-file* *default-contextual-rule-file*)) 50 | 51 | ;; Stopword db 52 | (format t "Finishing miscellaneous langutils setup.~%") 53 | (init-stopwords *default-stopwords-file*) 54 | (init-concise-stopwords *default-concise-stopwords-file*) 55 | 56 | ;; Concepts 57 | (clear-concept-cache) 58 | 59 | (format t "Done initializing langutils.~%") 60 | t)) 61 | 62 | (defun clean-langutils () 63 | (reset-token-counts) 64 | ;; (reset-token-tables) 65 | (initialize-tokens) 66 | (clean-lexicon) 67 | (clean-tagger) 68 | (clean-stopwords)) 69 | 70 | (defun reset-langutils () 71 | (clean-langutils) 72 | (init-langutils)) 73 | 74 | (eval-when (:load-toplevel) 75 | ;; Read config 76 | (when *auto-init* 77 | (init-langutils))) 78 | -------------------------------------------------------------------------------- /src/lemma.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: lemma 6 | ;;;; Purpose: Identify the lemma of a given word or token in the lexicon 7 | ;;;; or guess the root using the porter algorithm 8 | ;;;; 9 | ;;;; Programmer: Ian S. 
Eslick 10 | ;;;; Date Started: October 2004 11 | ;;;; 12 | 13 | (in-package :langutils) 14 | 15 | ;;;; find root 16 | 17 | (defun get-lemma (word &key pos (noun t) porter) 18 | "Provides the root word string for the provided word string" 19 | (token-for-id (get-lemma-for-id (id-for-token word) :pos pos :noun noun :porter porter))) 20 | 21 | (defun get-lemma-for-id (id &key pos (noun t) (porter nil)) 22 | "Returns a lemma id for the provided word id. pos only 23 | returns the root for the provided pos type. noun will 24 | stem nouns to the singular form by default and porter 25 | determines whether the porter algorithm is used for 26 | unknown terms. pos type causes the noun argument to be 27 | ignored" 28 | (aif (get-lexicon-entry id) 29 | ;; Use lexicon if we can, should have all root forms 30 | (let ((roots (lexicon-entry-roots it))) 31 | (if roots 32 | (cond ((and pos noun) ;; POS given and lemmatizing nouns 33 | (aif (assoc pos roots) 34 | (values (cdr it) (car it)) 35 | (values id nil))) 36 | ((and pos (not noun)) ;; POS speech given, don't lemmatize nouns 37 | (aif (and (not (find pos '(:NN :NNS))) 38 | (assoc pos roots)) 39 | (values (cdr it) (car it)) 40 | (values id nil))) 41 | ((and (not pos) noun) ;; lemmatize everything, no pos so just return default root 42 | (values (cdar roots) ;; only return pos if id is different so can test root-found on 2nd value 43 | (when (not (eq (cdar roots) id)) 44 | (caar roots)))) 45 | ((and (not pos) (not noun)) ;; don't lemmatize nouns so return default unless word has a noun form 46 | (if (or (eq (caar roots) :NN) 47 | (eq (caar roots) :NNS) 48 | (eq (caar roots) :NNP)) 49 | (values id nil) 50 | (values (cdar roots) (caar roots))))) 51 | ;; else assume I am the root 52 | (values id nil))) 53 | ;; Use porter algorithm if we have no lexicon entry 54 | (if porter 55 | (let ((str (token-for-id id))) 56 | (values (id-for-token (stem str)) :NN)) ;; guess noun 57 | (values id nil)))) 58 | 59 | ;; ======================== 60 | 
;; General interface 61 | ;; ======================== 62 | 63 | (defun *get-determiners* () 64 | (mapcar #'id-for-token '("the" "a" "an"))) 65 | 66 | (defun select-token (token &key strip-det noun pos porter (lemma t)) 67 | "Internal per-token function" 68 | (if (and strip-det (find token (*get-determiners*))) 69 | nil 70 | (if lemma 71 | (get-lemma-for-id token :noun noun :pos pos :porter porter) 72 | token))) 73 | 74 | (defmethod-exported lemmatize ((sequence list) &key strip-det pos (noun t) porter last-only) 75 | "Non-destructive lemmatization of provided sequence" 76 | (labels ((select (token orig) 77 | (select-token token 78 | :lemma (if (and last-only (not (last-elt? orig))) nil t) 79 | :strip-det strip-det 80 | :pos pos 81 | :noun noun 82 | :porter porter)) 83 | (last-elt? (list) 84 | (null (cdr list))) 85 | (descend (orig new) 86 | (if (null orig) 87 | (nreverse new) 88 | (descend (cdr orig) 89 | (aif (select (car orig) orig) 90 | (cons it new) 91 | new))))) 92 | (when sequence 93 | (assert (integerp (car sequence))) 94 | (descend sequence nil)))) 95 | 96 | 97 | (defmethod-exported lemmatize ((sequence array) &key strip-det pos (noun t) porter last-only) 98 | (let ((new (make-array (length sequence) :element-type 'fixnum :adjustable t)) 99 | (last (1- (length sequence))) 100 | (index 0)) 101 | (loop for token across sequence for i from 0 do 102 | (awhen (select-token token 103 | :lemma (if (and last-only (not (= i last))) nil t) 104 | :strip-det strip-det 105 | :pos pos 106 | :noun noun 107 | :porter porter) 108 | (setf (aref new index) it) 109 | (incf index))) 110 | ;; trim to the number of tokens actually kept 111 | (adjust-array new index))) 112 | 113 | ;; ------------------------------ 114 | ;; Find surface forms from lemma 115 | ;; ------------------------------ 116 | 117 | (defvar *pos-class-map* 118 | '((:VB :V) 119 | (:VBD :V) 120 | (:VBN :V) 121 | (:VBG :V) 122 | (:VBZ :V) 123 | (:JJ :A) 124 | (:NN :N) 125 | (:NNS :N) 126 | (:NNP :N))) 127 | 128 | (defun morph-surface-forms-text (root &optional pos-class) 129 | (mapcar #'token-for-id
(morph-surface-forms root pos-class))) 129 | 130 | (defun morph-case-surface-forms (root &optional (pos-class nil)) 131 | "All cases of morphological surface forms of the provided root" 132 | (let ((forms (morph-surface-forms root pos-class))) 133 | (append forms 134 | (nflatten 135 | (mapcar #'get-lexicon-case-forms forms))))) 136 | 137 | 138 | (defun morph-surface-forms (root &optional (pos-class nil)) 139 | "Takes a word or id and returns all surface form ids or all forms of 140 | class 'pos-class' where pos-class is a symbol of langutils::V,A,N" 141 | (labels ((surface-forms (root-pairs) 142 | (remove-duplicates 143 | (flatten 144 | (mapcar #'(lambda (root-pair) 145 | (lexicon-entry-surface-forms 146 | (get-lexicon-entry (cdr root-pair)))) 147 | root-pairs)))) 148 | (filter-by-pos (surface-forms) 149 | (select-if #'(lambda (word) 150 | (let ((entry (get-lexicon-entry word))) 151 | (some #'(lambda (tag) 152 | (in-pos-class? tag pos-class)) 153 | (lexicon-entry-tags entry)))) 154 | surface-forms))) 155 | (let* ((all-root-pairs (lexicon-entry-roots (get-lexicon-entry root))) 156 | (all-surface (surface-forms all-root-pairs))) 157 | (when all-surface 158 | (if pos-class 159 | (filter-by-pos all-surface) 160 | all-surface))))) 161 | 162 | 163 | (defun-exported in-pos-class? 
(element class) 164 | (aif (and (or (eq class :V) 165 | (eq class :N) 166 | (eq class :A)) 167 | (assoc element *pos-class-map*)) 168 | (eq (cadr it) class))) 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /src/lexicon.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: lexicon 6 | ;;;; Purpose: The BROWN/WSJ lexicon, extendable, supports all langutils 7 | ;;;; 8 | ;;;; Programmer: Ian S. Eslick 9 | ;;;; Date Started: October 2004 10 | ;;;; 11 | 12 | (in-package :langutils) 13 | 14 | ;; ----------------------------- 15 | ;; LEXICON 16 | 17 | ;; The Lexicon 18 | (defvar *lexicon* nil) 19 | 20 | (defmacro with-static-memory-allocation (() &rest body) 21 | `(progn ,@body)) 22 | 23 | ;; Loading the lexicon 24 | (defun init-lexicon (&optional lexicon-file lemma-file) 25 | "Populates the lexicon with 'word tag1 tag2' structured 26 | lines from lexicon-file" 27 | (unless *lexicon* 28 | (write-log lexicon-init "Initializing the lexicon") 29 | (setf *lexicon* (make-hash-table :size 100000 :rehash-size 1.3 :rehash-threshold 0.8)) 30 | (let ((lexicon-file (aif lexicon-file it 31 | (translate-logical-pathname "think:data;lang;en;langutils;LEXICON-BROWN-AND-WSJ.txt"))) 32 | (lemma-file (aif lemma-file it 33 | (translate-logical-pathname "think:data;lang;en;langutils;stem-dict.txt")))) 34 | (with-static-memory-allocation () 35 | (write-log lexicon-init "Reading lexicon from ~A" lexicon-file) 36 | ;; Parse the lines into a predicate ID and two node structures 37 | (with-open-file ( s lexicon-file :external-format :iso-8859-1) 38 | (do-count-contentful-lines (l count s) 39 | (when (= (mod count 10000) 0) (write-log lexicon-init "Processed ~A lines" count)) 40 
| (let ((lexicon-entry (extract-words l))) ;; (pregex:split "\\s+" l))) 41 | (add-basic-entry (car lexicon-entry) (mapcar #'mkkeysym (cdr lexicon-entry)))))) 42 | (write-log lexicon-init "Reading word->lemma data from ~A" lemma-file) 43 | ;; Parse the lines into a predicate ID and possible roots 44 | (with-open-file ( s lemma-file :external-format :iso-8859-1) 45 | (do-count-contentful-lines (l count s) 46 | (when (= (mod count 10000) 0) (write-log lexicon-init "Processed ~A lines" count)) 47 | (let ((roots-entry (extract-words l))) 48 | (add-roots (first roots-entry) 49 | (mapcar #'(lambda (root+pos) 50 | (cons (mkkeysym (cdr root+pos)) 51 | (id-for-token (car root+pos)))) 52 | (pairs (cdr roots-entry))))))))))) 53 | 54 | (defun clean-lexicon () 55 | (setf *lexicon* nil)) 56 | 57 | ;; 58 | ;; Internals and accessors 59 | ;; 60 | 61 | (defstruct (lexicon-entry 62 | (:conc-name "LEXICON-ENTRY-")) 63 | ;; Base lexicon (human entered) 64 | tags 65 | id 66 | roots 67 | surface-forms 68 | case-forms) 69 | 70 | (defun lexicon-entry-tag (entry) 71 | (aif entry (car (lexicon-entry-tags entry)) nil)) 72 | 73 | (defun get-lexicon-entry (word) 74 | (etypecase word 75 | (string (hash-get *lexicon* (id-for-token word))) 76 | (integer (hash-get *lexicon* word)))) 77 | 78 | (defun set-lexicon-entry (word entry) 79 | (etypecase word 80 | (string (hash-put *lexicon* (id-for-token word) entry)) 81 | (integer (hash-put *lexicon* word entry)))) 82 | 83 | (defsetf get-lexicon-entry set-lexicon-entry) 84 | 85 | (defun get-lexicon-default-pos (word) 86 | (awhen (get-lexicon-entry word) 87 | (lexicon-entry-tag it))) 88 | 89 | (defun get-lexicon-case-forms (word) 90 | (awhen (get-lexicon-entry word) 91 | (lexicon-entry-case-forms it))) 92 | 93 | 94 | ;; 95 | ;; Modifying the lexicon 96 | ;; 97 | 98 | (defun ensure-lexicon-entry (word &key roots surface) 99 | (aif (get-lexicon-entry word) 100 | it 101 | (add-basic-entry word 102 | (mapcar #'car roots) 103 | :roots roots :surface surface))) 104
| 105 | (defun add-basic-entry ( word tags &key roots surface) 106 | "Add a word and its probability-ordered tags to the lexicon" 107 | (unless (and (stringp word) (string= word "")) 108 | (let* ((id (etypecase word 109 | (string (id-for-token word)) 110 | (integer word))) 111 | (cases (make-cases 112 | (etypecase word 113 | (string word) 114 | (integer (token-for-id word))))) 115 | (entry (make-lexicon-entry 116 | :id id 117 | :tags tags 118 | :roots roots 119 | :surface-forms surface 120 | :case-forms cases))) 121 | (hash-put *lexicon* id entry) 122 | entry))) 123 | 124 | (defun make-cases ( word ) 125 | ;; (declare (type (string word))) 126 | (let ((all-cases 127 | (mapcar #'id-for-token 128 | (list (string-downcase word) 129 | (string-upcase word) 130 | (concatenate 'string 131 | (string-upcase (subseq word 0 1)) 132 | (string-downcase (subseq word 1))))))) 133 | (remove (id-for-token word) all-cases))) 134 | 135 | (defun add-unknown-lexicon-entry (word guessed-tag) 136 | (let ((id (etypecase word 137 | (string (id-for-token word)) 138 | (integer word)))) 139 | (hash-put *lexicon* id 140 | (make-lexicon-entry 141 | :id id 142 | :tags (list guessed-tag) 143 | :surface-forms nil 144 | :case-forms nil)))) 145 | 146 | (defun add-roots ( word root-pairs ) 147 | "Set the root list (pairs of pos_type/root) for the entry for 'word'" 148 | (let ((entry (ensure-lexicon-entry word :roots root-pairs))) 149 | ;; Add roots and surface forms 150 | (add-root-forms word root-pairs) 151 | ;; Ensure default tags get set 152 | (when (null (lexicon-entry-tags entry)) 153 | (setf (lexicon-entry-tags entry) 154 | (mapcar #'car (lexicon-entry-roots entry)))))) 155 | 156 | (defun add-root-forms ( word pos-root-pairs ) 157 | (let ((id (etypecase word 158 | (string (id-for-token word)) 159 | (integer word))) 160 | (pr-pairs (reverse pos-root-pairs))) ;; get ordering right in object 161 | ;; Add roots to entry 162 | (mapc #'(lambda (x) (add-root id x)) 163 |
pr-pairs) 165 | ;; Add surface form to each root 166 | (mapc #'(lambda (x) (add-surface-form (cdr x) id)) 167 | pr-pairs))) 168 | 169 | (defun add-root ( word pos-root-pair) 170 | "Add a root form to word if not exists" 171 | (let ((entry (ensure-lexicon-entry word :roots (list pos-root-pair)))) 172 | (pushnew pos-root-pair (lexicon-entry-roots entry) :test #'equal))) 173 | 174 | (defun add-surface-form ( root surface-form ) 175 | "Add a surface form to a root word" 176 | ;; (assert (and (numberp surface-form) (numberp root))) 177 | (let ((entry (ensure-lexicon-entry root :surface (list surface-form)))) 178 | (pushnew surface-form (lexicon-entry-surface-forms entry) :test #'equal))) 179 | -------------------------------------------------------------------------------- /src/my-meta.lisp: -------------------------------------------------------------------------------- 1 | ;;;; ; 2 | ;;;; (c) 2001 by Jochen Schmidt. 3 | ;;;; 4 | ;;;; File: meta.lisp 5 | ;;;; Revision: 1.0.0 6 | ;;;; Description: A simple parsing technique 7 | ;;;; Date: 01.07.2001 8 | ;;;; Authors: Jochen Schmidt 9 | ;;;; Tel: (+49 9 11) 47 20 603 10 | ;;;; Email: jsc@dataheaven.de 11 | ;;;; 12 | ;;;; Redistribution and use in source and binary forms, with or without 13 | ;;;; modification, are permitted provided that the following conditions 14 | ;;;; are met: 15 | ;;;; 1. Redistributions of source code must retain the above copyright 16 | ;;;; notice, this list of conditions and the following disclaimer. 17 | ;;;; 2. Redistributions in binary form must reproduce the above copyright 18 | ;;;; notice, this list of conditions and the following disclaimer in the 19 | ;;;; documentation and/or other materials provided with the distribution. 
20 | ;;;; 21 | ;;;; THIS SOFTWARE IS PROVIDED "AS IS" AND THERE ARE NEITHER 22 | ;;;; EXPRESSED NOR IMPLIED WARRANTIES - THIS INCLUDES, BUT 23 | ;;;; IS NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 24 | ;;;; AND FITNESS FOR A PARTICULAR PURPOSE.IN NO WAY ARE THE 25 | ;;;; AUTHORS LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | ;;;; SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 27 | ;;;; NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES ; 28 | ;;;; LOSS OF USE, DATA, OR PROFITS ; OR BUSINESS INTERRUPTION) 29 | ;;;; 30 | ;;;; For further details contact the authors of this software. 31 | ;;;; 32 | ;;;; Jochen Schmidt 33 | ;;;; Zuckmantelstr. 11 34 | ;;;; 91616 Neusitz 35 | ;;;; GERMANY 36 | ;;;; 37 | ;;;; 38 | ;;;; NOTE: 39 | ;;;; This code is based on the well known paper "Pragmatic Parsing in Common Lisp" 40 | ;;;; of Henry G. Baker. You can find it at: 41 | ;;;; 42 | ;;;; http://linux.rice.edu/~rahul/hbaker/Prag-Parse.html 43 | ;;;; 44 | ;;;; The parsing technique Baker describes in his paper goes back to: 45 | ;;;; 46 | ;;;; Schorre, D.V. "META II: A Syntax-Oriented Compiler Writing Language". 47 | ;;;; Proc. 19'th Nat'l. Conf. of the ACM (Aug. 1964),D1.3-1-D1.3-11. 
48 | ;;;; 49 | ;;;; 50 | ;;;; Nurnberg, 01.Jul.2001 Jochen Schmidt 51 | 52 | ;;;; Extensions by Ian Eslick, MIT Media Laboratory 2005 53 | 54 | (in-package :my-meta) 55 | 56 | ;;; String matching 57 | (defmacro string-match (x &key source-symbol) 58 | (etypecase x 59 | (character 60 | `(when (and (< index end) (eql (char ,source-symbol index) ,x)) 61 | (incf index))) 62 | (string 63 | (let ((old-index-symbol (gensym "OLD-INDEX-"))) 64 | `(let ((,old-index-symbol index)) 65 | (or (and ,@(map 'list #'(lambda (c) `(string-match ,c 66 | :source-symbol ,source-symbol)) x)) 67 | (progn (setq index ,old-index-symbol) nil))))))) 68 | 69 | (defmacro string-match-type (x v &key source-symbol) 70 | (let ((char-sym (gensym))) 71 | `(when (< index end) 72 | (let ((,char-sym (char ,source-symbol index))) 73 | (declare (base-char ,char-sym)) 74 | (when (typep ,char-sym ',x) 75 | (setq ,v ,char-sym) (incf index)))))) 76 | 77 | 78 | ;;; List matching 79 | (defmacro list-match (x &key source-symbol); sublist uses new lexical index 80 | `(when (and (consp ,source-symbol) 81 | ,(if (atom x) `(eql (car ,source-symbol) ',x) 82 | `(let ((,source-symbol (car ,source-symbol))) ,(compile-list x :source-symbol source-symbol)))) 83 | (pop ,source-symbol) t)) 84 | 85 | (defmacro list-match-type (x v &key source-symbol) 86 | `(when (and (consp ,source-symbol) (typep (car ,source-symbol) ',x)) 87 | (setq ,v (car ,source-symbol)) (pop ,source-symbol) t)) 88 | 89 | (defun compile-list (l &key source-symbol) 90 | (if (atom l) `(eql ,source-symbol ',l) 91 | `(and ,(compileit (car l) :meta-parser-type :list :source-symbol source-symbol) 92 | ,(compile-list (cdr l) :source-symbol source-symbol)))) 93 | 94 | 95 | ;;; Stream matching 96 | (defmacro stream-match (x &key source-symbol) 97 | `(when (eql (peek-char nil ,source-symbol) ',x) (read-char ,source-symbol))) 98 | 99 | (defmacro stream-match-type (x v &key source-symbol) 100 | `(when (typep (peek-char nil ,source-symbol) ',x) (setq ,v (read-char 
,source-symbol)))) 101 | 102 | (defstruct (meta 103 | (:print-function 104 | (lambda (m s d &aux (char (meta-char m)) (form (meta-form m))) 105 | (declare (ignore d)) 106 | (ecase char 107 | ((#\@ #\! #\$) (format s "~A~A" char form)) 108 | (#\[ (format s "[~{~A~^ ~}]" form)) 109 | (#\{ (format s "{~{~A~^ ~}}" form)))))) 110 | char 111 | form) 112 | 113 | (defun symbol-name-equal (src target &key key (test #'equal)) 114 | (funcall test (symbol-name src) (symbol-name (if key (funcall key target) target)))) 115 | 116 | (defun compileit (x &key meta-parser-type source-symbol) 117 | (typecase x 118 | (meta 119 | (ecase (meta-char x) 120 | (#\! (cond ((symbol-name-equal 'meta-dict (meta-form x) :key #'first) 121 | (let ((dict (eval (second (meta-form x))))) 122 | (unless (consp dict) 123 | (error "Meta Dictionaries are lists of strings, 124 | provided input is not list")) 125 | `(or ,@(mapcar #'(lambda (f) (compileit f 126 | :meta-parser-type meta-parser-type 127 | :source-symbol source-symbol)) 128 | dict)))) 129 | (t (meta-form x)))) 130 | (#\[ `(and ,@(mapcar #'(lambda (f) (compileit f 131 | :meta-parser-type meta-parser-type 132 | :source-symbol source-symbol)) 133 | (meta-form x)))) 134 | (#\{ `(or ,@(mapcar #'(lambda (f) (compileit f 135 | :meta-parser-type meta-parser-type 136 | :source-symbol source-symbol)) 137 | (meta-form x)))) 138 | (#\$ `(not (do () ((not ,(compileit (meta-form x) 139 | :meta-parser-type meta-parser-type 140 | :source-symbol source-symbol)))))) 141 | (#\@ (let ((f (meta-form x))) (list (ecase meta-parser-type 142 | (:list 'list-match-type) 143 | (:string 'string-match-type) 144 | (:stream 'stream-match-type)) 145 | (car f) (cadr f) 146 | :source-symbol source-symbol 147 | ))))) 148 | (t (list (ecase meta-parser-type 149 | (:list 'list-match) 150 | (:string 'string-match) 151 | (:stream 'stream-match)) 152 | x 153 | :source-symbol source-symbol 154 | )))) 155 | 156 | (defparameter *saved-readtable* (copy-readtable)) 157 | (defparameter 
*meta-readtable* (copy-readtable)) 158 | 159 | (defun meta-reader (s c) (make-meta :char c :form (read s))) 160 | 161 | 162 | (mapc #'(lambda (c) (set-macro-character c #'meta-reader nil *meta-readtable*)) '(#\@ #\$ #\!)) 163 | 164 | (set-macro-character #\{ 165 | #'(lambda (s c) (make-meta :char c :form (read-delimited-list #\} s t))) nil *meta-readtable*) 166 | 167 | (set-macro-character #\[ 168 | #'(lambda (s c) (make-meta :char c :form (read-delimited-list #\] s t))) nil *meta-readtable*) 169 | 170 | (mapc #'(lambda (c) (set-macro-character c (get-macro-character #\)) nil *meta-readtable*)) 171 | '(#\] #\})) 172 | 173 | 174 | (defmacro with-stream-meta ((source-symbol stream) &body body) 175 | `(let ((,source-symbol ,stream)) 176 | (macrolet ((meta-match (x) 177 | (compileit x 178 | :meta-parser-type :stream 179 | :source-symbol ',source-symbol))) 180 | ,@body))) 181 | 182 | (defmacro with-string-meta ((source-symbol string-buffer &key (start 0) end) &body body) 183 | `(let* ((,source-symbol ,string-buffer) 184 | (index ,start) 185 | (end ,(or end `(length ,source-symbol)))) 186 | (declare (fixnum index end) 187 | (type simple-base-string ,source-symbol)) 188 | ;; (type base-string ,source-symbol)) 189 | (macrolet ((meta-match (x) 190 | (compileit x 191 | :meta-parser-type :string 192 | :source-symbol ',source-symbol))) 193 | ,@body))) 194 | 195 | 196 | (defmacro with-list-meta ((source-symbol list) &body body) 197 | `(let ((,source-symbol ,list)) 198 | (macrolet ((meta-match (x) 199 | (compileit x 200 | :meta-parser-type :list 201 | :source-symbol ',source-symbol))) 202 | ,@body))) 203 | 204 | (defun enable-meta-syntax () 205 | (copy-readtable *meta-readtable* *readtable*)) 206 | 207 | (defun disable-meta-syntax() 208 | (copy-readtable *saved-readtable* *readtable*)) 209 | 210 | (provide 'meta) 211 | 212 | #| 213 | 214 | (eval-when (compile load eval) 215 | (deftype digit () '(member #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)) 216 | 217 | (deftype non-digit () 
'(not (member #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))) 218 | 219 | (defun ctoi (d) (- (char-code d) #.(char-code #\0))) 220 | ) 221 | 222 | (eval-when (compile load eval) 223 | (enable-meta-syntax) 224 | ) 225 | 226 | 227 | (defun parse-int (string &aux (s +1) d (n 0)) 228 | (with-string-meta (buffer string) 229 | (and 230 | (match 231 | [{#\+ [#\- !(setq s -1)] []} 232 | @(digit d) !(setq n (ctoi d)) 233 | $[@(digit d) !(setq n (+ (* n 10) (ctoi d)))]]) 234 | (* s n)))) 235 | 236 | (eval-when (compile load eval) 237 | (disable-meta-syntax) 238 | ) 239 | 240 | |# 241 | -------------------------------------------------------------------------------- /src/package.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- 2 | 3 | ;; Language utilities asd files 4 | 5 | (defpackage :my-meta 6 | (:use #:common-lisp) 7 | (:export #:with-string-meta 8 | #:with-list-meta 9 | #:with-stream-meta 10 | #:enable-meta-syntax 11 | #:disable-meta-syntax 12 | #:index 13 | #:end 14 | #:meta-match)) 15 | 16 | (defpackage #:langutils-tokenize 17 | (:use #:cl #:my-meta) 18 | (:export tokenize-stream 19 | tokenize-string 20 | tokenize-file)) 21 | 22 | (defpackage #:langutils 23 | (:use #:cl #:stdutils) 24 | (:import-from :langutils-tokenize 25 | tokenize-stream 26 | tokenize-string 27 | tokenize-file) 28 | (:export ;; initialization 29 | init-langutils 30 | clean-langutils 31 | reset-langutils 32 | ;; vector documents 33 | vector-document 34 | make-vector-document 35 | document-text 36 | document-tags 37 | document-annotations 38 | vector-document-words 39 | get-token-id 40 | get-tag 41 | string-tag 42 | length-of 43 | string-tag-tokenized 44 | print-vector-document 45 | vector-document-string 46 | write-vector-document 47 | read-vector-document 48 | read-vector-document-to-string 49 | ;; phrases in documents 50 | make-phrase 51 | make-phrase-from-sentence 52 | make-phrase-from-vdoc 
53 | phrase 54 | phrase-start 55 | phrase-end 56 | phrase-type 57 | phrase-document 58 | phrase-length 59 | phrase-equal 60 | phrase-overlap 61 | print-phrase 62 | print-window 63 | phrase->string 64 | phrase->token-array 65 | phrase-words 66 | phrase-distance 67 | phrase-lemmas 68 | print-phrase-lemmas 69 | find-phrase 70 | find-phrase-intervals 71 | change-word 72 | remove-word 73 | add-word 74 | lemmatize-phrase 75 | get-annotation 76 | set-annotation 77 | unset-annotation 78 | ;; altered phrases 79 | altered-phrase 80 | make-alterable-phrase 81 | ;; tokens 82 | id-for-token 83 | ids-for-tokens 84 | token-for-id 85 | tokens-for-ids 86 | save-tokens 87 | suspicious-word? 88 | suspicious-string? 89 | string->token-array 90 | tokenized-string->token-array 91 | ;; lexicon 92 | get-lexicon-default-pos 93 | get-lexicon-entry 94 | get-lexicon-case-forms 95 | lexicon-entry 96 | lexicon-entry-tag 97 | lexicon-entry-tags 98 | lexicon-entry-id 99 | lexicon-entry-roots 100 | lexicon-entry-surface-forms 101 | add-lexicon-entry 102 | add-lemma 103 | ;; lemma 104 | get-lemma 105 | get-lemma-for-id 106 | morph-surface-forms 107 | morph-case-surface-forms 108 | morph-surface-forms-text 109 | ;; tokenizer 110 | tokenize-stream 111 | tokenize-string 112 | tokenize-file 113 | ;; text tagger 114 | tag 115 | tag-tokenized 116 | ;; vector tagger 117 | vector-tag 118 | vector-tag-tokenized 119 | initial-tag 120 | ;; chunker 121 | chunk 122 | chunk-tokenized 123 | all-chunks 124 | get-event-chunks 125 | get-extended-event-chunks1 126 | get-extended-event-chunks2 127 | get-nx-chunks 128 | get-vx-chunks 129 | get-adverb-chunks 130 | get-imperative-chunks 131 | get-p-chunks 132 | get-pp-chunks 133 | head-verbs 134 | head-verb 135 | root-nouns 136 | root-noun 137 | ;; stopwords 138 | stopword? 139 | contains-is? 140 | string-stopword? 141 | string-contains-is? 142 | concise-stopword? 143 | string-concise-stopword? 
144 | )) 145 | 146 | (in-package #:langutils) 147 | 148 | 149 | -------------------------------------------------------------------------------- /src/porter.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | 3 | (in-package :langutils) 4 | 5 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 6 | ;; The Porter Stemming Algorithm, somewhat mechanically hand translated to Common Lisp by 7 | ;; Steven M. Haflich smh@franz.com Feb 2002. Most of the inline comments refer to the 8 | ;; original C code. At the time of this translation the code passes the associated Porter 9 | ;; test files. See the function test at the end of this file. 10 | 11 | ;; This port is intended to be portable ANSI Common Lisp. However, it has only been 12 | ;; compiled and tested with Allegro Common Lisp. This code is offered in the hope it will 13 | ;; be useful, but with no warranty of correctness, suitability, usability, or anything 14 | ;; else. The C implementation from which this code was derived was not reentrant, relying 15 | ;; on global variables. This implementation corrects that. It is intended that a word to 16 | ;; be stemmed will be in a string with fill-pointer, as this is a natural result when 17 | ;; parsing user input, web scraping, whatever. If not, a string with fill-pointer is 18 | ;; created, but this is an efficiency hit and is here intended only for lightweight use or 19 | ;; testing. Using some resource mechanism on these strings would be a useful improvement, 20 | ;; whether here or in the calling code. 21 | 22 | ;; Postscript: When I contacted Martin Porter about this anachronism, he decided to fix 23 | ;; the C version to implement proper reentrancy. The CL version is now also served from 24 | ;; his central site. 
It should be functionally identical to this one, modulo the current 25 | ;; comment and a couple harmless formatting and comment changes. 26 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 27 | 28 | ;; This is the Porter stemming algorithm, coded up in ANSI C by the 29 | ;; author. It may be regarded as canonical, in that it follows the 30 | ;; algorithm presented in 31 | 32 | ;; Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 33 | ;; no. 3, pp 130-137, 34 | 35 | ;; only differing from it at the points marked --DEPARTURE-- below. 36 | 37 | ;; See also http://www.tartarus.org/~martin/PorterStemmer 38 | 39 | ;; The algorithm as described in the paper could be exactly replicated 40 | ;; by adjusting the points of DEPARTURE, but this is barely necessary, 41 | ;; because (a) the points of DEPARTURE are definitely improvements, and 42 | ;; (b) no encoding of the Porter stemmer I have seen is anything like 43 | ;; as exact as this version, even with the points of DEPARTURE! 44 | 45 | ;; You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which 46 | ;; 'stem' takes a list of inputs and sends the stemmed equivalent to 47 | ;; stdout. 48 | 49 | ;; The algorithm as encoded here is particularly fast. 50 | 51 | ;; Release 1 52 | 53 | ;; The main part of the stemming algorithm starts here. b is a buffer 54 | ;; holding a word to be stemmed. The letters are in b[k0], b[k0+1] ... 55 | ;; ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted 56 | ;; downwards as the stemming progresses. Zero termination is not in fact 57 | ;; used in the algorithm. 58 | 59 | ;; Note that only lower case sequences are stemmed. Forcing to lower case 60 | ;; should be done before stem(...) is called. 61 | 62 | ;; cons(i) is TRUE <=> b[i] is a consonant.
63 | 64 | ;;; Common Lisp port Version 1.01 65 | 66 | ;;; 67 | ;;; Common Lisp port Version history 68 | ;;; 69 | ;;; 1.0 -- smh@franz.com Feb 2002 70 | ;;; initial release 71 | ;;; 72 | ;;; 1.01 -- smh@franz.com 25 Apr 2004 73 | ;;; step4 signalled error for "ion" "ions". Thanks to Jeff Heard 74 | ;;; for detecting this and suggesting the fix. 75 | 76 | (defun consonantp (str i) 77 | (let ((char (char str i))) 78 | (cond ((member char '(#\a #\e #\i #\o #\u)) nil) 79 | ((eql char #\y) 80 | (if (= i 0) t (not (consonantp str (1- i))))) 81 | (t t)))) 82 | 83 | ;; m() measures the number of consonant sequences between k0 and j. if c is 84 | ;; a consonant sequence and v a vowel sequence, and <..> indicates arbitrary 85 | ;; presence, 86 | 87 | ;; <c><v> gives 0 88 | ;; <c>vc<v> gives 1 89 | ;; <c>vcvc<v> gives 2 90 | ;; <c>vcvcvc<v> gives 3 91 | ;; .... 92 | 93 | (defun m (str lim) 94 | (let ((n 0) 95 | (i 0)) 96 | (loop 97 | (when (>= i lim) (return-from m n)) 98 | (if (not (consonantp str i)) (return nil)) 99 | (incf i)) 100 | (incf i) 101 | (loop 102 | (loop 103 | (if (>= i lim) (return-from m n)) 104 | (if (consonantp str i) (return nil)) 105 | (incf i)) 106 | (incf i) 107 | (incf n) 108 | (loop 109 | (if (>= i lim) (return-from m n)) 110 | (if (not (consonantp str i)) (return nil)) 111 | (incf i)) 112 | (incf i)))) 113 | 114 | ;; vowelinstem() is TRUE <=> k0,...j contains a vowel 115 | 116 | (defun vowelinstem (str) 117 | (loop for i from 0 below (fill-pointer str) 118 | unless (consonantp str i) return t)) 119 | 120 | ;; doublec(j) is TRUE <=> j,(j-1) contain a double consonant. 121 | 122 | (defun doublec (str i) 123 | (cond ((< i 1) nil) 124 | ((not (eql (char str i) (char str (1- i)))) nil) 125 | (t (consonantp str i)))) 126 | 127 | ;; cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant 128 | ;; and also if the second c is not w,x or y. this is used when trying to 129 | ;; restore an e at the end of a short word. e.g.
130 | 131 | ;; cav(e), lov(e), hop(e), crim(e), but 132 | ;; snow, box, tray. 133 | 134 | (defun cvc (str lim) 135 | (decf lim) 136 | (if (or (< lim 2) 137 | (not (consonantp str lim)) 138 | (consonantp str (1- lim)) 139 | (not (consonantp str (- lim 2)))) 140 | (return-from cvc nil)) 141 | (if (member (char str lim) '(#\w #\x #\y)) (return-from cvc nil)) 142 | t) 143 | 144 | ;; ends(s) is TRUE <=> k0,...k ends with the string s. 145 | 146 | (defun ends (str ending) 147 | (declare (string str) (simple-string ending)) 148 | (let ((len1 (length str)) (len2 (length ending))) 149 | (loop 150 | for pa downfrom (1- len1) to 0 151 | and pb downfrom (1- len2) to 0 152 | unless (eql (char str pa) (char ending pb)) 153 | return nil 154 | finally (return (when (< pb 0) 155 | (decf (fill-pointer str) len2) 156 | t))))) 157 | 158 | ;; setto(s) sets (j+1),...k to the characters in the string s, readjusting k. 159 | 160 | (defun setto (str suffix) 161 | (declare (string str) (simple-string suffix)) 162 | (loop for char across suffix 163 | do (vector-push-extend char str))) 164 | 165 | ;; r(s) is used further down. 166 | 167 | (defun r (str s sfp) 168 | (if (> (m str (fill-pointer str)) 0) 169 | (setto str s) 170 | (setf (fill-pointer str) sfp))) 171 | 172 | ;; step1ab() gets rid of plurals and -ed or -ing. e.g. 
173 | 174 | ;; caresses -> caress 175 | ;; ponies -> poni 176 | ;; ties -> ti 177 | ;; caress -> caress 178 | ;; cats -> cat 179 | 180 | ;; feed -> feed 181 | ;; agreed -> agree 182 | ;; disabled -> disable 183 | 184 | ;; matting -> mat 185 | ;; mating -> mate 186 | ;; meeting -> meet 187 | ;; milling -> mill 188 | ;; messing -> mess 189 | 190 | ;; meetings -> meet 191 | 192 | (defun step1ab (str) 193 | (when (eql (char str (1- (fill-pointer str))) #\s) 194 | (cond ((ends str "sses") (incf (fill-pointer str) 2)) 195 | ((ends str "ies") (setto str "i")) 196 | ((not (eql (char str (- (fill-pointer str) 2)) #\s)) (decf (fill-pointer str))))) 197 | (cond ((ends str "eed") (if (> (m str (fill-pointer str)) 0) 198 | (incf (fill-pointer str) 2) 199 | (incf (fill-pointer str) 3))) 200 | ((let ((sfp (fill-pointer str))) 201 | (if (or (ends str "ed") 202 | (ends str "ing")) 203 | (if (vowelinstem str) 204 | t 205 | (progn (setf (fill-pointer str) sfp) 206 | nil)))) 207 | (cond ((ends str "at") (setto str "ate")) 208 | ((ends str "bl") (setto str "ble")) 209 | ((ends str "iz") (setto str "ize")) 210 | ((doublec str (1- (fill-pointer str))) 211 | (unless (member (char str (1- (fill-pointer str))) '(#\l #\s #\z)) 212 | (decf (fill-pointer str)))) 213 | (t (if (and (= (m str (fill-pointer str)) 1) 214 | (cvc str (fill-pointer str))) 215 | (setto str "e")))))) 216 | str) 217 | 218 | ;; step1c() turns terminal y to i when there is another vowel in the stem. 219 | 220 | (defun step1c (str) 221 | (let ((saved-fill-pointer (fill-pointer str))) 222 | (when (and (ends str "y") 223 | (vowelinstem str)) 224 | (setf (char str (fill-pointer str)) #\i)) 225 | (setf (fill-pointer str) saved-fill-pointer)) 226 | str) 227 | 228 | ;; step2() maps double suffices to single ones. so -ization ( = -ize plus 229 | ;; -ation) maps to -ize etc. note that the string before the suffix must give 230 | ;; m() > 0. 
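;; As an illustration of how the steps below cascade, consider the
;; paper's own running example, GENERALIZATIONS -> GENER: step1ab drops
;; the plural "s", step2 rewrites -ization to -ize, step3 rewrites
;; -alize to -al, and step4 strips -al once m() > 1. Guarded with
;; #+never like the test code elsewhere in this file; evaluate by hand
;; after loading.

#+never
(stem "generalizations") ; => "gener", per Porter (1980)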
231 | 232 | (defun step2 (str) 233 | (let ((sfp (fill-pointer str))) 234 | (when (> sfp 2) 235 | (block nil 236 | (case (char str (- (length str) 2)) 237 | (#\a (when (ends str "ational") (r str "ate" sfp) (return)) 238 | (when (ends str "tional") (r str "tion" sfp) (return))) 239 | (#\c (when (ends str "enci") (r str "ence" sfp) (return)) 240 | (when (ends str "anci") (r str "ance" sfp) (return))) 241 | (#\e (when (ends str "izer") (r str "ize" sfp) (return))) 242 | (#\l (when (ends str "bli") (r str "ble" sfp) (return)) 243 | ;; -DEPARTURE- 244 | ;; To match the published algorithm, replace prev line with 245 | ;; ((when (ends str "abli") (r str "able" sfp) (return)) 246 | (when (ends str "alli") (r str "al" sfp) (return)) 247 | (when (ends str "entli") (r str "ent" sfp) (return)) 248 | (when (ends str "eli") (r str "e" sfp) (return)) 249 | (when (ends str "ousli") (r str "ous" sfp) (return))) 250 | (#\o (when (ends str "ization") (r str "ize" sfp) (return)) 251 | (when (ends str "ation") (r str "ate" sfp) (return)) 252 | (when (ends str "ator") (r str "ate" sfp) (return))) 253 | (#\s (when (ends str "alism") (r str "al" sfp) (return)) 254 | (when (ends str "iveness") (r str "ive" sfp) (return)) 255 | (when (ends str "fulness") (r str "ful" sfp) (return)) 256 | (when (ends str "ousness") (r str "ous" sfp) (return))) 257 | (#\t (when (ends str "aliti") (r str "al" sfp) (return)) 258 | (when (ends str "iviti") (r str "ive" sfp) (return)) 259 | (when (ends str "biliti") (r str "ble" sfp) (return))) 260 | ;; -DEPARTURE- 261 | ;; To match the published algorithm, delete next line. 262 | (#\g (when (ends str "logi") (r str "log" sfp) (return))))))) 263 | str) 264 | 265 | ;; step3() deals with -ic-, -full, -ness etc. similar strategy to step2. 
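;; A quick sketch of the -ness and -ful branches of step3 below, as
;; exercised through the full stem pipeline. Guarded with #+never like
;; the test code elsewhere in this file; evaluate by hand after loading.

#+never
(list (stem "goodness") ; "ness" removed => "good"
      (stem "hopeful")) ; "ful" removed => "hope"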
266 | 267 | (defun step3 (str) 268 | (let ((sfp (fill-pointer str))) 269 | (block nil 270 | (case (char str (1- (length str))) 271 | (#\e (when (ends str "icate") (r str "ic" sfp) (return)) 272 | (when (ends str "ative") (r str "" sfp) (return)) ; huh? 273 | (when (ends str "alize") (r str "al" sfp) (return))) 274 | (#\i (when (ends str "iciti") (r str "ic" sfp) (return))) 275 | (#\l (when (ends str "ical") (r str "ic" sfp) (return)) 276 | (when (ends str "ful") (r str "" sfp) (return))) ; huh? 277 | (#\s (when (ends str "ness") (r str "" sfp) (return))) ; huh? 278 | ))) 279 | str) 280 | 281 | ;; step4() takes off -ant, -ence etc., in context vcvc. 282 | 283 | (defun step4 (str) 284 | (let ((sfp (fill-pointer str))) 285 | (when (> sfp 2) ; Unnecessary? 286 | (block nil 287 | (case (char str (- sfp 2)) 288 | (#\a (if (ends str "al") (return))) 289 | (#\c (if (ends str "ance") (return)) 290 | (if (ends str "ence") (return))) 291 | (#\e (if (ends str "er") (return))) 292 | (#\i (if (ends str "ic") (return))) 293 | (#\l (if (ends str "able") (return)) 294 | (if (ends str "ible") (return))) 295 | (#\n (if (ends str "ant") (return)) 296 | (if (ends str "ement") (return)) 297 | (if (ends str "ment") (return)) 298 | (if (ends str "ent") (return))) 299 | (#\o (if (ends str "ion") 300 | (let ((len (length str))) 301 | (if (and (> len 0) 302 | (let ((c (char str (1- len)))) 303 | (or (eql c #\s) (eql c #\t)))) 304 | (return) 305 | (setf (fill-pointer str) sfp)))) 306 | (if (ends str "ou") (return))) ; takes care of -ous 307 | (#\s (if (ends str "ism") (return))) 308 | (#\t (if (ends str "ate") (return)) 309 | (if (ends str "iti") (return))) 310 | (#\u (if (ends str "ous") (return))) 311 | (#\v (if (ends str "ive") (return))) 312 | (#\z (if (ends str "ize") (return)))) 313 | (return-from step4 str)) 314 | (unless (> (m str (fill-pointer str)) 1) 315 | (setf (fill-pointer str) sfp))) 316 | str)) 317 | 318 | ;; step5() removes a final -e if m() > 1, and changes -ll to -l if m() 
> 1. 319 | 320 | (defun step5 (str) 321 | (let ((len (fill-pointer str))) 322 | (if (eql (char str (1- len)) #\e) 323 | (let ((a (m str len))) 324 | (if (or (> a 1) 325 | (and (= a 1) 326 | (not (cvc str (1- len))))) 327 | (decf (fill-pointer str)))))) 328 | (let ((len (fill-pointer str))) 329 | (if (and (eql (char str (1- len)) #\l) 330 | (doublec str (1- len)) 331 | (> (m str len) 1)) 332 | (decf (fill-pointer str)))) 333 | str) 334 | 335 | ;; In stem(p,i,j), p is a char pointer, and the string to be stemmed is from p[i] to p[j] 336 | ;; inclusive. Typically i is zero and j is the offset to the last character of a string, 337 | ;; (p[j+1] == '\0'). The stemmer adjusts the characters p[i] ... p[j] and returns the new 338 | ;; end-point of the string, k. Stemming never increases word length, so i <= k <= j. To 339 | ;; turn the stemmer into a module, declare 'stem' as extern, and delete the remainder of 340 | ;; this file. 341 | 342 | (defun stem (str) 343 | (let ((len (length str))) 344 | ;; With this line, strings of length 1 or 2 don't go through the 345 | ;; stemming process, although no mention is made of this in the 346 | ;; published algorithm. Remove the line to match the published 347 | ;; algorithm. 348 | (if (<= len 2) (return-from stem str)) ; /*-DEPARTURE-*/ 349 | (if (typep str 'simple-string) ; Primarily for testing. 350 | (setf str 351 | (make-array len :element-type 'character 352 | :fill-pointer len :initial-contents str))) 353 | (step1ab str) (step1c str) (step2 str) (step3 str) (step4 str) (step5 str) 354 | str)) 355 | 356 | #+never 357 | (trace step1ab step1c step2 step3 step4 step5) 358 | 359 | #+never 360 | (defun test () ; Run against the distributed test files. 
361 | (with-open-file (f1 "voc.txt" :external-format :ascii) 362 | (with-open-file (f2 "output.txt" :external-format :ascii) 363 | (loop as w1 = (read-line f1 nil nil) 364 | while w1 365 | as w2 = (read-line f2 nil nil) 366 | as w3 = (stem w1) 367 | if (equal w2 w3) 368 | count t into successes 369 | else count t into failures 370 | and do (format t "(stem ~s) => ~s wanted ~s~%" w1 w3 w2) 371 | finally (progn (format t "successes ~d failures ~d~%" successes failures) 372 | (return failures)))))) 373 | -------------------------------------------------------------------------------- /src/regex-tokenize.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: regex-tokenize 6 | ;;;; Purpose: Simple regex based tokenizer for natural language pre-tagging 7 | ;;;; 8 | ;;;; Programmer: Ian S.
Eslick 9 | ;;;; Date Started: September 2004 10 | ;;;; 11 | 12 | (in-package :langutils) 13 | 14 | 15 | 16 | (defparameter known-abbreviations 17 | '("Apr" "Assn" "Aug" "Av" "Ave" "Bldg" "Cf" "Co" 18 | "Corp" "Ct" "Dec" "Dept" "Dist" "Dr" "Eq" "Feb" "Fig" "Figs" 19 | "Gov" "Inc" "Jan" "Jas" "Jr" "Jul" "Jun" "Lt" "Ltd" "MM" 20 | "Mar" "May" "Mfg" "Mme" "Mr" "Mrs" "Ms" "Msec" "Mt" "Mts" 21 | "No" "Nov" "Oct" "Op" "Rd" "Rep" "Rte" "Sen" "Sep" "Sr" 22 | "St" "Stat" "Tech" "USN" "Vol" "Vs" "Yo" "a" "al" "apr" 23 | "aug" "bur" "ca" "cc" "cf" "cm" "cu" "dec" "dia" "ed" 24 | "eds" "eg" "eqn" "eqns" "est" "etc" "ex" "feb" "fig" "figs" 25 | "ft" "gm" "hp" "hr" "jan" "jul" "jun" "kc" "l" "lb" "lbs" 26 | "m" "mEq" "mar" "may" "mc" "mg" "mil" "min" "ml" "mm" "mos" 27 | "nov" "nw" "oct" "oz" "p" "pl" "pp" "prop" "r" "repr" "rev" 28 | "sec" "sep" "sq" "trans" "v" "var" "viz" "vol" "vols" "vs" 29 | "yd" "yrs")) 30 | 31 | (defparameter compiled-abbrevs 32 | (loop for abbrev in known-abbreviations 33 | nconcing (list (pregex:create-scanner 34 | (concatenate 'string " +(" abbrev ")\\.
+"))))) 35 | 36 | (defparameter multi-word-exp 37 | '("a bit" "according to" "all of a sudden" "at large" "at last" 38 | "from time to time" "given that" "in addition to" "in addition" 39 | "in back of" "in between" "in brief" "inside out" "kung fu" 40 | "let 's" "no doubt" "no longer" "no matter" "none the less" 41 | "okey dokey" "old fashioned" "one another" "per cent" "per diem" 42 | "provided that" "providing that" "spot on" "time and again" 43 | "up to" "up to date" "upside down" "whether or not")) 44 | 45 | (defparameter filters 46 | (list (make-regex-replace-filter "\\n" " " "Erase line boundaries") 47 | (make-regex-replace-filter "([][?!()\";{}])" " \\1 " "Punctuation that is always solo") 48 | ;; apostrophes 49 | (make-regex-replace-filter "'" " '" "Apostrophes") 50 | (make-regex-replace-filter "([A-Za-z0-9]) '(s)" "\\1'\\2" "Apostrophes, recover 1") 51 | (make-regex-replace-filter "(n) '(t)" " \\1'\\2" "Apostrophes, recover 2") 52 | ;; punctuation in numbers 53 | (make-regex-replace-filter "([0-9])([:,/])([0-9])" "\\1@\\2\\3" "Number punctuation 1") 54 | (make-regex-replace-filter "([^@])([:,/])" "\\1 \\2 " "Number punctuation 2") 55 | (make-regex-replace-filter "@" "" "Get rid of @'s") 56 | ;; word-final periods 57 | (make-regex-replace-filter "\\b([A-Za-z])\\. " "\\1 " "ie: G. Gordon Liddy") 58 | (make-regex-replace-filter "(\\.[A-Za-z]+)\\. " "\\1 " "ie: U.S. i.e. m.p.h.") 59 | (make-regex-replace-filter "\\. *([,;])" " \\1" "ie: Prop., but_") 60 | (make-regex-replace-filter "\\. +([a-z])" " \\1" "ie: cm. in diameter") 61 | ;; Allow abbreviations with periods 62 | #'(lambda (text &key print) 63 | (when print (print "Finding abbreviations...")) 64 | (loop for abbrev in compiled-abbrevs do 65 | (setf text (pregex:regex-replace-all abbrev text "\\1@ "))) 66 | text) 67 | ;; add line breaks 68 | (make-regex-replace-filter "([.?!]) +" " \\1 69 | " "Add a line break.") 70 | (make-regex-replace-filter "@" "." 
"") 71 | ;; normalize spaces 72 | (make-regex-replace-filter " +" " " "") 73 | ;; multiword expressions 74 | (make-regex-replace-filter 75 | (concatenate 'string "\\b(" (merge-or-regex-strings multi-word-exp) ")\\b") 76 | "<\\1>" "Wrap multi-word expressions: ") 77 | (make-regex-replace-filter " (?=[^<]+>)" "_" "") 78 | (make-regex-replace-filter "<>" "" "m_w_e"))) 79 | 80 | 81 | ;;(defun-exported tokenize-string (string &key debug) 82 | ;; (filter-text string filters :debug debug)) 83 | 84 | (defun tokenize-file (source target &key full) 85 | (with-open-file (src source ) 86 | (with-open-file (targ target :direction :output :if-exists :supersede) 87 | (if full 88 | (write-string 89 | (tokenize-string 90 | (read-file-to-string src)) 91 | targ) 92 | (do-contentful-lines (line src) 93 | (write-line (tokenize-string line) targ)))))) 94 | -------------------------------------------------------------------------------- /src/stopwords.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: stopwords 6 | ;;;; Purpose: Simple db for stopword identity, all tokenized 7 | ;;;; 8 | ;;;; Programmer: Ian S. Eslick 9 | ;;;; Date Started: October 2004 10 | ;;;; 11 | 12 | ;;; stopwords vs concise-stopwords: stopwords is a wide list of words. 13 | ;;; concise-stopwords are a *very* small list of words. 
Mainly pronouns and determiners 14 | (in-package :langutils) 15 | 16 | ;; ================================= 17 | ;; Stopword Database and Processing 18 | ;; ================================= 19 | 20 | (defvar *stopwords* nil) 21 | (defvar *concise-stopwords* nil) 22 | 23 | (defun init-stopwords (&optional path) 24 | (when (null path) (setf path "~/Work/fsrc/lisp/langutils/data/stopwords.txt")) 25 | (setf *stopwords* (make-hash-table :test #'equal :size 1000)) 26 | (init-word-test) 27 | (with-open-file (f path :external-format :ascii) 28 | (do-contentful-lines (line f) 29 | (hash-put *stopwords* (id-for-token (string-trim " " line)) t)))) 30 | 31 | (defun init-concise-stopwords (&optional path) 32 | (when (null path) (setf path (translate-logical-pathname "think:data;lang;en;langutils;concise-stopwords.txt"))) 33 | (setf *concise-stopwords* (make-hash-table :test #'equal :size 10)) 34 | (with-open-file (f path :external-format :ascii) 35 | (do-contentful-lines (line f) 36 | (hash-put *concise-stopwords* (id-for-token (string-trim " " line)) t)))) 37 | 38 | 39 | (defun clean-stopwords () 40 | (setf *stopwords* nil) 41 | (setf *concise-stopwords* nil)) 42 | 43 | (defun stopword? (id) 44 | "Identifies id as a 'stopword'" 45 | (hash-get *stopwords* id)) 46 | 47 | (defun concise-stopword? (id) 48 | "Identifies id as a 'concise-stopword' word. 49 | concise-stopwords are a *very* small list of words. Mainly pronouns and determiners" 50 | (hash-get *concise-stopwords* id)) 51 | 52 | 53 | (defvar *is-token* nil) 54 | (defvar *s-token* nil) 55 | 56 | (defun init-word-test () 57 | (declare (special *is-token* *s-token*)) 58 | (setf *is-token* (id-for-token "is")) 59 | (setf *s-token* (id-for-token "'s"))) 60 | 61 | (defun contains-is?
(ids) 62 | "Tests list of ids for 'is' words" 63 | (declare (special *is-token* *s-token*)) 64 | (find-if (lambda (id) (or (eq *is-token* id) 65 | (eq *s-token* id))) 66 | ids)) 67 | 68 | ;; -------------------------- 69 | 70 | (defun string-stopword? (word) 71 | (stopword? (id-for-token word))) 72 | 73 | (defun string-concise-stopword? (word) 74 | "Check the word if it is a 'concise-stopword' word. 75 | concise-stopwords are a *very* small list of words. Mainly pronouns and determiners" 76 | (concise-stopword? (id-for-token word))) 77 | 78 | (defun string-contains-is? (words) 79 | "Checks the list for a string containing 'is'" 80 | (contains-is? (mapcar #'id-for-token words))) 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /src/tagger-data.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: tagger-data 6 | ;;;; Purpose: Lisp version of the Brill tagger w/ WSJ/BROWN data; implements 7 | ;;;; the data file loading and representation 8 | ;;;; 9 | ;;;; Programmer: Ian S. 
Eslick 10 | ;;;; Date Started: October 2004 11 | ;;;; 12 | 13 | (in-package :langutils) 14 | 15 | ;; ----------------------------------- 16 | ;; Contextual rule files 17 | 18 | ;; 19 | ;; The parser builds rules from tokenized rule expressions of the form: 20 | ;; <oldtag> <newtag> RULE-TYPE <arg1> <arg2> 21 | ;; 22 | ;; Where <arg1> can be tag1 or word1 with semantics as described 23 | ;; in *contextual-rule-args* above; similarly for <arg2> 24 | ;; 25 | (def-contextual-rule-parser make-contextual-rule 26 | ("SURROUNDTAG" (match (0 oldtag) (-1 tag1) (+1 tag2)) => newtag) 27 | ("NEXTTAG" (match (0 oldtag) (+1 tag1)) => newtag) 28 | ("CURWD" (match (0 oldtag) (0 word1)) => newtag) 29 | ("NEXTWD" (match (0 oldtag) (+1 word1)) => newtag) 30 | ("RBIGRAM" (match (0 oldtag) (0 word1) (+1 word2)) => newtag) 31 | ("WDNEXTTAG" (match (0 oldtag) (0 word1) (+1 tag2)) => newtag) 32 | ("WDAND2AFT" (match (0 oldtag) (0 word1) (+2 word2)) => newtag) 33 | ("WDAND2TAGAFT" (match (0 oldtag) (0 word1) (+2 tag2)) => newtag) 34 | ("NEXT2TAG" (match (0 oldtag) (+2 tag1)) => newtag) 35 | ("NEXT2WD" (match (0 oldtag) (+2 word1)) => newtag) 36 | ("NEXTBIGRAM" (match (0 oldtag) (+1 tag1) (+2 tag2)) => newtag) 37 | ("NEXT1OR2TAG" (match (0 oldtag) (or (+1 tag1) (+2 tag1))) => newtag) 38 | ("NEXT1OR2WD" (match (0 oldtag) (or (+1 word1) (+2 word1))) => newtag) 39 | ("NEXT1OR2OR3TAG" (match (0 oldtag) (or (+1 tag1) (+2 tag1) (+3 tag1))) => newtag) 40 | ("PREVTAG" (match (0 oldtag) (-1 tag1)) => newtag) 41 | ("PREVWD" (match (0 oldtag) (-1 word1)) => newtag) 42 | ("LBIGRAM" (match (0 oldtag) (0 word1) (-1 word2)) => newtag) 43 | ("WDPREVTAG" (match (0 oldtag) (0 word1) (-1 tag2)) => newtag) 44 | ("WDAND2BFR" (match (0 oldtag) (0 word1) (-2 word2)) => newtag) 45 | ("WDAND2TAGBFR" (match (0 oldtag) (0 word1) (-2 tag2)) => newtag) 46 | ("PREV2TAG" (match (0 oldtag) (-2 tag1)) => newtag) 47 | ("PREV2WD" (match (0 oldtag) (-2 word1)) => newtag) 48 | ("PREV1OR2TAG" (match (0 oldtag) (or (-1 tag1) (-2 tag1))) => newtag) 49 | ("PREV1OR2WD" (match
(0 oldtag) (or (-1 word1) (-2 word1))) => newtag) 50 | ("PREV1OR2OR3TAG" (match (0 oldtag) (or (-1 tag1) (-2 tag1) (-3 tag1))) => newtag) 51 | ("PREV1OR2OR3WD" (match (0 oldtag) (or (-1 word1) (-2 word1) (-3 word1))) => newtag) 52 | ("PREVBIGRAM" (match (0 oldtag) (-1 tag2) (-2 tag1)) => newtag)) 53 | 54 | #| 55 | Macroexpand this to see code for a single rule closure: 56 | (def-contextual-rule-parser make-contextual-rule 57 | ("SURROUNDTAG" (match (0 oldtag) (-1 tag1) (+1 tag2)) => newtag)) 58 | |# 59 | 60 | (defun load-contextual-rules ( rule-file &aux rules ) 61 | (write-log tagger-init "Loading contextual rules") 62 | (with-open-file ( s rule-file :external-format :ascii) 63 | (do-contentful-lines (line s ) ;; :count count) 64 | (let* ((context-rule (ppcre:split "\\s+" line)) 65 | (closure (make-contextual-rule context-rule))) 66 | (if closure (push closure rules))))) 67 | (nreverse rules)) 68 | 69 | 70 | ;; ----------------------------------------- 71 | ;; LEXICAL RULE FILE INITIALIZATION AND I/F 72 | 73 | (defun load-lexical-rules ( rule-file &optional bigram-hash word-hash &aux (rule-list nil)) 74 | "Return a list of closure implementing the lexical rules 75 | in rule-file to tag words not found in the lexicon" 76 | (write-log tagger-init "Loading lexical rules") 77 | (with-open-file ( s rule-file :external-format :ascii) 78 | (do-contentful-lines (line s) 79 | (let ((lex-rule (ppcre:split "\\s+" line))) 80 | (let ((rule (make-lexical-rule lex-rule *lexicon* bigram-hash word-hash))) 81 | (if rule (push rule rule-list)))))) 82 | (nreverse rule-list)) 83 | 84 | (let ((pair (cons nil nil))) 85 | (defun guess-tag ( token initial-tag rule-list ) 86 | "Using rules in rule-table guess the tag of the token 'token'" 87 | (declare (optimize (speed 3) (safety 0) (debug 0) (space 0))) 88 | (setf (car pair) token) 89 | (setf (cdr pair) initial-tag) 90 | (aif (apply-rules pair rule-list) 91 | (cdr it) 92 | initial-tag))) 93 | 94 | ;; Expanded implementation, not great but 
was fast and easy... 95 | ;; 96 | ;; Example of lexical rule file format: 97 | ;; NN would fgoodright VB x 98 | ;; NN 0 fchar CD x 99 | ;; NN be fgoodright JJ x 100 | (defun make-lexical-rule (list lh bh wh) 101 | "Look through list for rule name" 102 | (declare (ignore bh) 103 | ;; (inline strncmp list eq equal char subseq strncmp-end2 length concatenate) 104 | (optimize (speed 0) (safety 3) (debug 3))) 105 | (let ((name (second list)) 106 | (fname (third list))) 107 | (cond ((string= name "char") 108 | (let ((prefix (char (first list) 0)) 109 | (new-tag (mkkeysym (third list)))) 110 | (declare (type character prefix) 111 | (type symbol new-tag)) 112 | #'(lambda (pair) 113 | (declare (type cons pair));; (inline char)) 114 | (let ((token (car pair)) 115 | (tag (cdr pair))) 116 | (declare (type string token) (type symbol tag) (ignore tag)) 117 | (if (eq prefix (char token 0)) 118 | (progn (setf (cdr pair) new-tag) pair) 119 | pair))))) 120 | ((string= fname "fchar") 121 | (let ((old-tag (mkkeysym (first list))) 122 | (prefix (char (second list) 0)) 123 | (new-tag (mkkeysym (fourth list)))) 124 | (declare (type symbol old-tag new-tag) (type character prefix)) 125 | #'(lambda (pair) 126 | (declare (type cons pair));; (inline char)) 127 | (let ((token (car pair)) 128 | (tag (cdr pair))) 129 | (declare (type string token) (type symbol tag)) 130 | (if (and (eq tag old-tag) 131 | (eq prefix (char token 0))) 132 | (progn (setf (cdr pair) new-tag) pair) 133 | pair))))) 134 | ((string= name "deletepref") 135 | (let ((prefix (first list)) 136 | (count (read-from-string (third list))) 137 | (new-tag (mkkeysym (fourth list)))) 138 | (declare #-mcl (type symbol new-tag) 139 | #-mcl (type fixnum count) 140 | #-mcl (type string prefix)) 141 | #'(lambda (pair) 142 | (declare #-mcl (type cons pair)) 143 | (let ((token (car pair)) 144 | (tag (cdr pair))) 145 | (declare (type string token) (type symbol tag) (ignore tag)) 146 | (if (strncmp prefix token count 0) 147 | (let ((root 
(subseq token count))) 148 | (if (or (hash-get lh root) 149 | (and wh (hash-get wh root))) 150 | (progn (setf (cdr pair) new-tag) pair) 151 | pair)) 152 | pair))))) 153 | ((string= fname "fdeletepref") 154 | (let ((old-tag (mkkeysym (first list))) 155 | (prefix (second list)) 156 | (count (read-from-string (fourth list))) 157 | (new-tag (mkkeysym (fifth list)))) 158 | (declare #-mcl (type symbol old-tag new-tag) 159 | #-mcl (type fixnum count) 160 | #-mcl (type string prefix)) 161 | #'(lambda (pair) 162 | (declare #-mcl (type cons pair)) 163 | (let ((token (car pair)) 164 | (tag (cdr pair))) 165 | (declare (type string token) (type symbol tag)) 166 | (if (and (eq tag old-tag) 167 | (strncmp prefix token count 0)) 168 | (let ((root (subseq token count))) 169 | (if (or (hash-get lh root) 170 | (and wh (hash-get wh root))) 171 | (progn (setf (cdr pair) new-tag) pair) 172 | pair)) 173 | pair))))) 174 | ((string= name "haspref") 175 | (let ((prefix (first list)) 176 | (count (read-from-string (third list))) 177 | (new-tag (mkkeysym (fourth list)))) 178 | (declare #-mcl (type symbol new-tag) 179 | #-mcl (type fixnum count) 180 | #-mcl (type string prefix)) 181 | #'(lambda (pair) 182 | (declare #-mcl (type cons pair)) 183 | (let ((token (car pair)) 184 | (tag (cdr pair))) 185 | (declare #-mcl (type string token) 186 | #-mcl (type symbol tag) 187 | (ignore tag)) 188 | (if (strncmp prefix token count 0) 189 | (progn (setf (cdr pair) new-tag) pair) 190 | pair))))) 191 | ((string= fname "fhaspref") 192 | (let ((old-tag (mkkeysym (first list))) 193 | (prefix (second list)) 194 | (count (read-from-string (fourth list))) 195 | (new-tag (mkkeysym (fifth list)))) 196 | (declare #-mcl (type symbol new-tag old-tag) 197 | #-mcl (type fixnum count) 198 | #-mcl (type string prefix)) 199 | #'(lambda (pair) 200 | (declare (type cons pair)) 201 | (let ((token (car pair)) 202 | (tag (cdr pair))) 203 | (declare #-mcl (type string token) 204 | #-mcl (type symbol tag)) 205 | (if (and (eq tag 
old-tag) 206 | (strncmp prefix token count 0)) 207 | (progn (setf (cdr pair) new-tag) pair) 208 | pair))))) 209 | ((string= name "deletesuf") 210 | (let ((suffix (first list)) 211 | (count (read-from-string (third list))) 212 | (new-tag (mkkeysym (fourth list)))) 213 | (declare #-mcl (type symbol new-tag) 214 | #-mcl (type fixnum count) 215 | #-mcl (type string suffix)) 216 | #'(lambda (pair) 217 | (declare #-mcl (type cons pair)) 218 | (let ((token (car pair)) 219 | (tag (cdr pair))) 220 | (declare #-mcl (type string token) 221 | #-mcl (type symbol tag) 222 | (ignore tag)) 223 | (if (strncmp-end2 suffix token count (- (length token) count)) 224 | (let ((root (subseq token 0 (- (length token) count)))) 225 | (if (or (hash-get lh root) 226 | (and wh (hash-get wh root))) 227 | (progn (setf (cdr pair) new-tag) pair) 228 | pair)) 229 | pair))))) 230 | ((string= fname "fdeletesuf") 231 | (let ((old-tag (mkkeysym (first list))) 232 | (suffix (second list)) 233 | (count (read-from-string (fourth list))) 234 | (new-tag (mkkeysym (fifth list)))) 235 | (declare #-mcl (type symbol new-tag old-tag) 236 | #-mcl (type fixnum count) 237 | #-mcl (type string suffix)) 238 | #'(lambda (pair) 239 | (declare #-mcl (type cons pair)) 240 | (let ((token (car pair)) 241 | (tag (cdr pair))) 242 | (declare #-mcl (type string token) 243 | #-mcl (type symbol tag)) 244 | (if (and (eq tag old-tag) 245 | (strncmp-end2 suffix token count (- (length token) count))) 246 | (let ((root (subseq token 0 (- (length token) count)))) 247 | (if (or (hash-get lh root) 248 | (and wh (hash-get wh root))) 249 | (progn (setf (cdr pair) new-tag) pair) 250 | pair)) 251 | pair))))) 252 | ((string= name "hassuf") 253 | (let ((suffix (first list)) 254 | (count (read-from-string (third list))) 255 | (new-tag (mkkeysym (fourth list)))) 256 | (declare #-mcl (type symbol new-tag) 257 | #-mcl (type fixnum count) 258 | #-mcl (type string suffix)) 259 | #'(lambda (pair) 260 | (declare #-mcl (type cons pair)) 261 | (let 
((token (car pair)) 262 | (tag (cdr pair))) 263 | (declare #-mcl (type string token) 264 | #-mcl (type symbol tag) 265 | (ignore tag)) 266 | (if (strncmp-end2 suffix token count (- (length token) count)) 267 | (progn (setf (cdr pair) new-tag) pair) 268 | pair))))) 269 | ((string= fname "fhassuf") 270 | (let ((old-tag (mkkeysym (first list))) 271 | (suffix (second list)) 272 | (count (read-from-string (fourth list))) 273 | (new-tag (mkkeysym (fifth list)))) 274 | (declare #-mcl (type symbol new-tag old-tag) 275 | #-mcl (type fixnum count) 276 | #-mcl (type string suffix)) 277 | #'(lambda (pair) 278 | (declare #-mcl (type cons pair)) 279 | (let ((token (car pair)) 280 | (tag (cdr pair))) 281 | (declare #-mcl (type string token) 282 | #-mcl (type symbol tag)) 283 | (if (and (eq tag old-tag) 284 | (strncmp-end2 suffix token count (- (length token) count))) 285 | (progn (setf (cdr pair) new-tag) pair) 286 | pair))))) 287 | ((string= name "addsuf") 288 | (let ((suffix (first list)) 289 | ;; (count (read-from-string (third list))) 290 | (new-tag (mkkeysym (fourth list)))) 291 | (declare #-mcl (type symbol new-tag) 292 | ; #-mcl (type fixnum count) 293 | #-mcl (type string suffix)) 294 | #'(lambda (pair) 295 | (declare #-mcl (type cons pair)) 296 | (let ((token (car pair)) 297 | (tag (cdr pair))) 298 | (declare #-mcl (type string token) 299 | #-mcl (type symbol tag) 300 | (ignore tag)) 301 | (let ((new-word (concatenate 'string token suffix))) 302 | (if (or (hash-get lh new-word) 303 | (and wh (hash-get wh new-word))) 304 | (progn (setf (cdr pair) new-tag) pair) 305 | pair)))))) 306 | ((string= fname "faddsuf") 307 | (let ((old-tag (mkkeysym (first list))) 308 | (suffix (second list)) 309 | ;; (count (read-from-string (fourth list))) 310 | (new-tag (mkkeysym (fifth list)))) 311 | (declare #-mcl (type symbol new-tag old-tag) 312 | ;; #-mcl (type fixnum count) 313 | #-mcl (type string suffix)) 314 | #'(lambda (pair) 315 | (declare #-mcl (type cons pair)) 316 | (let ((token 
(car pair)) 317 | (tag (cdr pair))) 318 | (declare #-mcl (type string token) 319 | #-mcl (type symbol tag)) 320 | (if (eq tag old-tag) 321 | (let ((new-word (concatenate 'string token suffix))) 322 | (if (or (hash-get lh new-word) 323 | (and wh (hash-get wh new-word))) 324 | (progn (setf (cdr pair) new-tag) pair) 325 | pair)) 326 | pair))))) 327 | ;; ((string= name "addpref") 328 | ;; (let ((prefix (first list)) 329 | ;; (count (read-from-string (third list))) 330 | ;; (new-tag (mkkeysym (fourth list)))) 331 | ;; #'(lambda (token tag) 332 | ;; ((string= fname "faddpref") 333 | ;; (let ((old-tag (mkkeysym (first list))) 334 | ;; (prefix (second list)) 335 | ;; (count (read-from-string (fourth list))) 336 | ;; (new-tag (mkkeysym (fifth list)))) 337 | ;; #'(lambda (token tag) 338 | ;; NOTE: NO BIGRAM SUPPORT YET 339 | ;; "goodleft" "fgoodleft" "goodright" "fgoodright" 340 | (t (progn 341 | (write-log tagger-contextual "No rule found for: ~A." list) 342 | nil))))) 343 | 344 | 345 | (defun apply-rules ( datum rule-list ) 346 | "Apply rules to the values in values presuming that 347 | the returned list is also a list of values that can 348 | be passed to the next rule" 349 | (declare (optimize (speed 3) (safety 1) (debug 1) (space 0)) 350 | (type list rule-list) 351 | (type list datum)) 352 | (cond ((null rule-list) 353 | datum) 354 | ((null datum) 355 | (error "Null datum in rule prior to ~A.~%" (car rule-list))) 356 | (t (apply-rules (funcall (car rule-list) datum) (cdr rule-list))))) 357 | 358 | 359 | 360 | 361 | -------------------------------------------------------------------------------- /src/tagger-oldctx.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: tagger-ctxold 6 | ;;;; Purpose: Non-macro version of context 
rule parser/generator 7 | ;;;; 8 | ;;;; Programmer: Ian S. Eslick 9 | ;;;; Date Started: October 2004 10 | ;;;; 11 | 12 | (in-package :langutils) 13 | 14 | (defun make-contextual-rule-old ( contextual-rule ) 15 | (declare (optimize speed (safety 0)) 16 | (type list contextual-rule) 17 | (inline svref)) 18 | (let ((old (mkkeysym (first contextual-rule))) 19 | (new (mkkeysym (second contextual-rule))) 20 | (name (string-upcase (third contextual-rule))) 21 | (arg1 (fourth contextual-rule)) 22 | (arg2 (fifth contextual-rule))) 23 | (cond ((string= name "SURROUNDTAG") 24 | (let ((t1 (mkkeysym arg1)) 25 | (t2 (mkkeysym arg2))) 26 | #'(lambda (tokens tags pos) 27 | (declare (ignore tokens) 28 | (type simple-vector tags) 29 | (type fixnum pos) 30 | (optimize speed (safety 0))) 31 | (if (and (eq (svref tags pos) old) 32 | (eq (svref tags (- pos 1)) t1) 33 | (eq (svref tags (+ pos 1)) t2)) 34 | (progn 35 | (write-log tagger-contextual "SURROUNDTAG: ~A @ ~A" contextual-rule pos) 36 | (setf (svref tags pos) new)))))) 37 | ((string= name "NEXTTAG") 38 | (let ((t1 (mkkeysym arg1))) 39 | #'(lambda (tokens tags pos) 40 | (declare (ignore tokens) 41 | #-mcl (type (simple-vector symbol ) tags) 42 | #-mcl (type fixnum pos) 43 | (optimize speed (safety 0))) 44 | (if (and (eq (svref tags pos) old) 45 | (eq (svref tags (+ pos 1)) t1)) 46 | (progn 47 | (write-log tagger-contextual "NEXTTAG: ~A @ ~A" contextual-rule pos) 48 | (setf (svref tags pos) new)))))) 49 | ((string= name "CURWD") 50 | (let ((w1 (id-for-token arg1))) 51 | #'(lambda (tokens tags pos) 52 | (declare #-mcl (type (simple-array fixnum) tokens) 53 | #-mcl (type simple-vector tags) 54 | #-mcl (type fixnum pos) 55 | (optimize speed (safety 0))) 56 | (if (and (eq (svref tags pos) old) 57 | (eq (aref tokens pos) w1)) 58 | (progn 59 | (write-log tagger-contextual "CURWD: ~A @ ~A" contextual-rule pos) 60 | (setf (svref tags pos) new)))))) 61 | ((string= name "NEXTWD") 62 | (let ((w1 (id-for-token arg1))) 63 | #'(lambda (tokens 
tags pos) 64 | (declare #-mcl (type (simple-vector fixnum ) tokens) 65 | #-mcl (type (simple-vector symbol ) tags) 66 | #-mcl (type fixnum pos) 67 | (optimize speed (safety 0))) 68 | (if (and (eq (svref tags pos) old) 69 | (eq (aref tokens (+ pos 1)) w1)) 70 | (progn 71 | (write-log tagger-contextual "NEXTWD: ~A @ ~A" contextual-rule pos) 72 | (setf (svref tags pos) new)))))) 73 | ((string= name "RBIGRAM") 74 | (let ((me (id-for-token arg1)) 75 | (next (id-for-token arg2))) 76 | #'(lambda (tokens tags pos) 77 | (declare #-mcl (type (simple-vector fixnum ) tokens) 78 | #-mcl (type (simple-vector symbol ) tags) 79 | #-mcl (type fixnum pos) 80 | (optimize speed (safety 0))) 81 | (if (and (eq (svref tags pos) old) 82 | (eq (aref tokens pos) me) 83 | (eq (aref tokens (+ pos 1)) next)) 84 | (progn 85 | (write-log tagger-contextual "RBIGRAM: ~A @ ~A" contextual-rule pos) 86 | (setf (svref tags pos) new)))))) 87 | ((string= name "WDNEXTTAG") 88 | (let ((me (id-for-token arg1)) 89 | (t2 (mkkeysym arg2))) 90 | #'(lambda (tokens tags pos) 91 | (declare (type #-mcl (simple-vector fixnum ) tokens) 92 | #-mcl (type (simple-vector symbol ) tags) 93 | #-mcl (type fixnum pos) 94 | (optimize speed (safety 0))) 95 | (if (and (eq (svref tags pos) old) 96 | (eq (aref tokens pos) me) 97 | (eq (svref tags (+ pos 1)) t2)) 98 | (progn 99 | (write-log tagger-contextual "WDNEXTTAG: ~A @ ~A" contextual-rule pos) 100 | (setf (svref tags pos) new)))))) 101 | ((string= name "WDAND2AFT") 102 | (let ((w1 (id-for-token arg1)) 103 | (w2 (id-for-token arg2))) 104 | #'(lambda (tokens tags pos) 105 | (declare #-mcl (type (simple-vector fixnum ) tokens) 106 | #-mcl (type (simple-vector symbol ) tags) 107 | #-mcl (type fixnum pos) 108 | (optimize speed (safety 0))) 109 | (if (and (eq (svref tags pos) old) 110 | (eq (aref tokens pos) w1) 111 | (eq (aref tokens (+ pos 2)) w2)) 112 | (progn 113 | (write-log tagger-contextual "WDAND2AFT: ~A @ ~A" contextual-rule pos) 114 | (setf (svref tags pos) new)))))) 
115 | ((string= name "WDAND2TAGAFT") 116 | (let ((w1 (id-for-token arg1)) 117 | (t2 (mkkeysym arg2))) 118 | #'(lambda (tokens tags pos) 119 | (declare #-mcl (type (simple-vector fixnum ) tokens) 120 | #-mcl (type (simple-vector symbol ) tags) 121 | #-mcl (type fixnum pos) 122 | (optimize speed (safety 0))) 123 | (if (and (eq (svref tags pos) old) 124 | (eq (aref tokens pos) w1) 125 | (eq (svref tags (+ pos 2)) t2)) 126 | (progn 127 | (write-log tagger-contextual "WDAND2TAGAFT: ~A @ ~A" contextual-rule pos) 128 | (setf (svref tags pos) new)))))) 129 | ((string= name "NEXT2TAG") 130 | (let ((t1 (mkkeysym arg1))) 131 | #'(lambda (tokens tags pos) 132 | (declare (ignore tokens) 133 | #-mcl (type (simple-vector symbol ) tags) 134 | #-mcl (type fixnum pos)) 135 | (if (and (eq (svref tags pos) old) 136 | (eq (svref tags (+ pos 2)) t1)) 137 | (progn 138 | (write-log tagger-contextual "NEXT2TAG: ~A @ ~A" contextual-rule pos) 139 | (setf (svref tags pos) new)))))) 140 | ((string= name "NEXT2WD") 141 | (let ((w1 (id-for-token arg1))) 142 | #'(lambda (tokens tags pos) 143 | (if (and (eq (svref tags pos) old) 144 | (eq (aref tokens (+ pos 2)) w1)) 145 | (progn 146 | (write-log tagger-contextual "NEXT2WD: ~A @ ~A" contextual-rule pos) 147 | (setf (svref tags pos) new)))))) 148 | ((string= name "NEXTBIGRAM") 149 | (let ((t1 (mkkeysym arg1)) 150 | (t2 (mkkeysym arg2))) 151 | #-mcl (declare (type symbol t1 t2)) 152 | #'(lambda (tokens tags pos) 153 | (declare (ignore tokens) 154 | #-mcl (type (simple-vector symbol ) tags) 155 | #-mcl (type fixnum pos)) 156 | (if (and (eq (svref tags pos) old) 157 | (eq (svref tags (+ pos 1)) t1) 158 | (eq (svref tags (+ pos 2)) t2)) 159 | (progn 160 | (write-log tagger-contextual "NEXTBIGRAM: ~A @ ~A" contextual-rule pos) 161 | (setf (svref tags pos) new)))))) 162 | ((string= name "NEXT1OR2TAG") 163 | (let ((t1 (mkkeysym arg1))) 164 | #-mcl (declare (type symbol t1)) 165 | #'(lambda (tokens tags pos) 166 | (declare #-mcl (type (simple-vector fixnum 
) tokens) 167 | #-mcl (type (simple-vector symbol ) tags) 168 | #-mcl (type fixnum pos) 169 | (optimize speed (safety 0)) 170 | (ignore tokens)) 171 | (if (and (eq (svref tags pos) old) 172 | (or (eq (svref tags (+ pos 1)) t1) 173 | (eq (svref tags (+ pos 2)) t1))) 174 | (progn 175 | (write-log tagger-contextual "NEXT1OR2TAG: ~A @ ~A" contextual-rule pos) 176 | (setf (svref tags pos) new)))))) 177 | ((string= name "NEXT1OR2WD") 178 | (let ((w1 (id-for-token arg1))) 179 | (declare (type fixnum w1)) 180 | #'(lambda (tokens tags pos) 181 | (declare #-mcl (type (simple-vector fixnum ) tokens) 182 | #-mcl (type (simple-vector symbol ) tags) 183 | #-mcl (type fixnum pos) 184 | (optimize speed (safety 0))) 185 | (if (and (eq (svref tags pos) old) 186 | (or (eq (aref tokens (+ pos 1)) w1) 187 | (eq (aref tokens (+ pos 2)) w1))) 188 | (progn 189 | (write-log tagger-contextual "NEXT1OR2WD: ~A @ ~A" contextual-rule pos) 190 | (setf (svref tags pos) new)))))) 191 | ((string= name "NEXT1OR2OR3TAG") 192 | (let ((t1 (mkkeysym arg1))) 193 | (declare (type symbol t1)) 194 | #'(lambda (tokens tags pos) 195 | (declare #-mcl (type (simple-vector fixnum ) tokens) 196 | #-mcl (type (simple-vector symbol ) tags) 197 | #-mcl (type fixnum pos) 198 | (optimize speed (safety 0)) 199 | (ignore tokens)) 200 | (if (and (eq (svref tags pos) old) 201 | (or (eq (svref tags (+ pos 1)) t1) 202 | (eq (svref tags (+ pos 2)) t1) 203 | (eq (svref tags (+ pos 3)) t1))) 204 | (progn 205 | (write-log tagger-contextual "NEXT1OR2OR3TAG: ~A @ ~A" contextual-rule pos) 206 | (setf (svref tags pos) new)))))) 207 | ((string= name "NEXT1OR2OR3WD") 208 | (let ((w1 (id-for-token arg1))) 209 | (declare (type fixnum w1)) 210 | #'(lambda (tokens tags pos) 211 | (declare #-mcl (type (simple-vector fixnum ) tokens) 212 | #-mcl (type (simple-vector symbol ) tags) 213 | #-mcl (type fixnum pos) 214 | (optimize speed (safety 0))) 215 | (if (and (eq (svref tags pos) old) 216 | (or (eq (aref tokens (+ pos 1)) w1) 217 | (eq 
(aref tokens (+ pos 2)) w1) 218 | (eq (aref tokens (+ pos 3)) w1))) 219 | (progn 220 | (write-log tagger-contextual "NEXT1OR2OR3WD: ~A @ ~A" contextual-rule pos) 221 | (setf (svref tags pos) new)))))) 222 | ((string= name "PREVTAG") 223 | (let ((t1 (mkkeysym arg1))) 224 | (declare (type symbol t1)) 225 | #'(lambda (tokens tags pos) 226 | (declare #-mcl (type (simple-array fixnum ) tokens) 227 | #-mcl (type (simple-array symbol ) tags) 228 | #-mcl (type fixnum pos) 229 | (optimize speed (safety 0)) 230 | (ignore tokens)) 231 | (if (and (eq (svref tags pos) old) 232 | (eq (svref tags (- pos 1)) t1)) 233 | (progn 234 | (write-log tagger-contextual "PREVTAG: ~A @ ~A" contextual-rule pos) 235 | (setf (svref tags pos) new)))))) 236 | ((string= name "PREVWD") 237 | (let ((w1 (id-for-token arg1))) 238 | #-mcl (declare (type fixnum w1)) 239 | #'(lambda (tokens tags pos) 240 | (declare #-mcl (type (simple-vector fixnum ) tokens) 241 | #-mcl (type (simple-vector symbol ) tags) 242 | #-mcl (type fixnum pos) 243 | (optimize speed (safety 0))) 244 | (if (and (eq (svref tags pos) old) 245 | (eq (aref tokens (- pos 1)) w1)) 246 | (progn 247 | (write-log tagger-contextual "PREVWD: ~A @ ~A" contextual-rule pos) 248 | (setf (svref tags pos) new)))))) 249 | ((string= name "LBIGRAM") 250 | (let ((w1 (id-for-token arg1)) 251 | (w2 (id-for-token arg2))) 252 | #-mcl (declare (type fixnum w1 w2)) 253 | #'(lambda (tokens tags pos) 254 | (declare #-mcl (type (simple-vector fixnum ) tokens) 255 | #-mcl (type (simple-vector symbol ) tags) 256 | #-mcl (type fixnum pos) 257 | (optimize speed (safety 0))) 258 | (if (and (eq (svref tags pos) old) 259 | (eq (aref tokens pos) w1) 260 | (eq (aref tokens (- pos 1)) w2)) 261 | (progn 262 | (write-log tagger-contextual "LBIGRAM: ~A @ ~A" contextual-rule pos) 263 | (setf (svref tags pos) new)))))) 264 | ((string= name "WDPREVTAG") 265 | (let ((me (id-for-token arg1)) 266 | (t2 (mkkeysym arg2))) 267 | #-mcl (declare (type fixnum me) (type symbol t2)) 268
| #'(lambda (tokens tags pos) 269 | (declare #-mcl (type (simple-vector fixnum ) tokens) 270 | #-mcl (type (simple-vector symbol ) tags) 271 | #-mcl (type fixnum pos) 272 | (optimize speed (safety 0))) 273 | (if (and (eq (svref tags pos) old) 274 | (eq (aref tokens pos) me) 275 | (eq (svref tags (- pos 1)) t2)) 276 | (progn 277 | (write-log tagger-contextual "WDPREVTAG: ~A @ ~A" contextual-rule pos) 278 | (setf (svref tags pos) new)))))) 279 | ((string= name "WDAND2BFR") 280 | (let ((w1 (id-for-token arg1)) 281 | (w2 (id-for-token arg2))) 282 | #-mcl (declare (type fixnum w1 w2)) 283 | #'(lambda (tokens tags pos) 284 | (declare #-mcl (type (simple-vector fixnum ) tokens) 285 | #-mcl (type (simple-vector symbol ) tags) 286 | #-mcl (type fixnum pos) 287 | (optimize speed (safety 0))) 288 | (if (and (eq (svref tags pos) old) 289 | (eq (aref tokens pos) w1) 290 | (eq (aref tokens (- pos 2)) w2)) 291 | (progn 292 | (write-log tagger-contextual "WDAND2BFR: ~A @ ~A" contextual-rule pos) 293 | (setf (svref tags pos) new)))))) 294 | ((string= name "WDAND2TAGBFR") 295 | (let ((w1 (id-for-token arg1)) 296 | (t2 (mkkeysym arg2))) 297 | #-mcl (declare (type fixnum w1) (type symbol t2)) 298 | #'(lambda (tokens tags pos) 299 | (declare #-mcl (type fixnum pos) 300 | #-mcl (type (simple-array fixnum ) tokens) 301 | #-mcl (type (simple-array symbol ) tags) 302 | (optimize speed (safety 0))) 303 | (if (and (eq (svref tags pos) old) 304 | (eq (aref tokens pos) w1) 305 | (eq (svref tags (- pos 2)) t2)) 306 | (progn 307 | (write-log tagger-contextual "WDAND2TAGBFR: ~A @ ~A" contextual-rule pos) 308 | (setf (svref tags pos) new)))))) 309 | ((string= name "PREV2TAG") 310 | (let ((t1 (mkkeysym arg1))) 311 | #'(lambda (tokens tags pos) 312 | (declare #-mcl (type (simple-vector fixnum ) tokens) 313 | #-mcl (type (simple-vector symbol ) tags) 314 | #-mcl (type fixnum pos) 315 | (optimize speed (safety 0)) 316 | (ignore tokens)) 317 | (if (and (eq (svref tags pos) old) 318 | (eq (svref tags (- 
pos 2)) t1)) 319 | (progn 320 | (write-log tagger-contextual "PREV2TAG: ~A @ ~A" contextual-rule pos) 321 | (setf (svref tags pos) new)))))) 322 | ((string= name "PREV2WD") 323 | (let ((w1 (id-for-token arg1))) 324 | #'(lambda (tokens tags pos) 325 | (declare #-mcl (type (simple-vector fixnum ) tokens) 326 | #-mcl (type (simple-vector symbol ) tags) 327 | #-mcl (type fixnum pos) 328 | (optimize speed (safety 0))) 329 | (if (and (eq (svref tags pos) old) 330 | (eq (aref tokens (- pos 2)) w1)) 331 | (progn 332 | (write-log tagger-contextual "PREV2WD: ~A @ ~A" contextual-rule pos) 333 | (setf (svref tags pos) new)))))) 334 | ((string= name "PREV1OR2TAG") 335 | (let ((t1 (mkkeysym arg1))) 336 | #'(lambda (tokens tags pos) 337 | (declare #-mcl (type (simple-vector fixnum ) tokens) 338 | #-mcl (type (simple-vector symbol ) tags) 339 | #-mcl (type fixnum pos) 340 | (optimize speed (safety 0)) 341 | (ignore tokens)) 342 | (if (and (eq (svref tags pos) old) 343 | (or (eq (svref tags (- pos 1)) t1) 344 | (eq (svref tags (- pos 2)) t1))) 345 | (progn 346 | (write-log tagger-contextual "PREV1OR2TAG: ~A @ ~A" contextual-rule pos) 347 | (setf (svref tags pos) new)))))) 348 | ((string= name "PREV1OR2WD") 349 | (let ((w1 (id-for-token arg1))) 350 | #'(lambda (tokens tags pos) 351 | (declare #-mcl (type (simple-vector fixnum ) tokens) 352 | #-mcl (type (simple-vector symbol ) tags) 353 | #-mcl (type fixnum pos) 354 | (optimize speed (safety 0))) 355 | (if (and (eq (svref tags pos) old) 356 | (or (eq (aref tokens (- pos 1)) w1) 357 | (eq (aref tokens (- pos 2)) w1))) 358 | (progn 359 | (write-log tagger-contextual "PREV1OR2WD: ~A @ ~A" contextual-rule pos) 360 | (setf (svref tags pos) new)))))) 361 | ((string= name "PREV1OR2OR3TAG") 362 | (let ((t1 (mkkeysym arg1))) 363 | #'(lambda (tokens tags pos) 364 | (declare #-mcl (type (simple-vector fixnum ) tokens) 365 | #-mcl (type (simple-vector symbol ) tags) 366 | #-mcl (type fixnum pos) 367 | (optimize speed (safety 0)) 368 | (ignore 
tokens)) 369 | (if (and (eq (svref tags pos) old) 370 | (or (eq (svref tags (- pos 1)) t1) 371 | (eq (svref tags (- pos 2)) t1) 372 | (eq (svref tags (- pos 3)) t1))) 373 | (progn 374 | (write-log tagger-contextual "PREV1OR2OR3TAG: ~A @ ~A" contextual-rule pos) 375 | (setf (svref tags pos) new)))))) 376 | ((string= name "PREV1OR2OR3WD") 377 | (let ((w1 (id-for-token arg1))) 378 | #'(lambda (tokens tags pos) 379 | (declare #-mcl (type (simple-vector fixnum ) tokens) 380 | #-mcl (type (simple-vector symbol ) tags) 381 | #-mcl (type fixnum pos) 382 | (optimize speed (safety 0))) 383 | (if (and (eq (svref tags pos) old) 384 | (or (eq (aref tokens (- pos 1)) w1) 385 | (eq (aref tokens (- pos 2)) w1) 386 | (eq (aref tokens (- pos 3)) w1))) 387 | (progn 388 | (write-log tagger-contextual "PREV1OR2OR3WD: ~A @ ~A" contextual-rule pos) 389 | (setf (svref tags pos) new)))))) 390 | ((string= name "PREVBIGRAM") 391 | (let ((t1 (mkkeysym arg1)) 392 | (t2 (mkkeysym arg2))) 393 | #'(lambda (tokens tags pos) 394 | (declare #-mcl (type (simple-vector fixnum ) tokens) 395 | #-mcl (type (simple-vector symbol ) tags) 396 | #-mcl (type fixnum pos) 397 | (optimize speed (safety 0)) 398 | (ignore tokens)) 399 | (if (and (eq (svref tags pos) old) 400 | (eq (svref tags (- pos 1)) t2) 401 | (eq (svref tags (- pos 2)) t1)) 402 | (progn 403 | (write-log tagger-contextual "PREVBIGRAM: ~A @ ~A" contextual-rule pos) 404 | (setf (svref tags pos) new)))))) 405 | (t (write-log tagger-contextual "Unrecognized rule: ~A" contextual-rule))))) 406 | 407 | -------------------------------------------------------------------------------- /src/tagger.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: tagger 6 | ;;;; Purpose: Lisp version of the Brill tagger w/ 
WSJ/BROWN data 7 | ;;;; 8 | ;;;; Programmer: Ian S. Eslick 9 | ;;;; Date Started: October 2004 10 | ;;;; 11 | 12 | (in-package :langutils) 13 | 14 | ;; ========================================== 15 | ;; TAGGER STATE 16 | 17 | ;; Variables to hold the datafiles 18 | (defvar *tagger-lexical-rules* nil 19 | "Table to hold the lexical rule closures") 20 | (defvar *tagger-contextual-rules* nil 21 | "Table to hold the contextual rule closures") 22 | (defvar *tagger-bigrams* nil 23 | "Bigram hash (not implemented yet)") 24 | (defvar *tagger-wordlist* nil 25 | "Wordlist hash (not implemented yet)") 26 | 27 | ;; ========================================== 28 | ;; STRING TO STRING TAGGING 29 | ;; Speed: moderately fast 30 | ;; Size: Designed for strings on order of Dcache size 31 | 32 | (defun tag ( string ) 33 | (with-output-to-string (stream) 34 | (print-vector-document (vector-tag string) :stream stream))) 35 | 36 | (defun tag-tokenized ( string ) 37 | (with-output-to-string (stream) 38 | (print-vector-document (vector-tag-tokenized string) :stream stream))) 39 | 40 | ;; =============================== 41 | ;; FILE TAGGING - slow, small files 42 | ;; Speed: slow 43 | ;; Input size: Works on small files 44 | 45 | (defun read-file-to-string (file) 46 | (with-output-to-string (stream) 47 | (with-open-file (file file) 48 | (do-stream-lines (line file) 49 | (format stream "~A~%" line))))) 50 | 51 | (defun-exported read-file-as-tagged-document (file) 52 | (vector-tag (read-file-to-string file))) 53 | 54 | (defun-exported read-and-tag-file (file) 55 | (tag (read-file-to-string file))) 56 | 57 | ;; ================================= 58 | ;; STRING TO VECTOR-DOCUMENT TAGGING 59 | ;; Speed: optimal 60 | ;; Input size: < 100k strings 61 | 62 | (defun test-vector-tag-tokenized ( string ) 63 | (time (vector-tag-tokenized string))) 64 | 65 | (defun vector-tag ( string ) 66 | "Returns a 'document' which is a class containing a pair of vectors 67 | representing the string in the internal 
token format. Handles arbitrary data." 68 | (vector-tag-tokenized (mvbind (succ consumed data remainder) (tokenize-string string) 69 | (declare (ignore succ consumed remainder)) 70 | data))) 71 | (let ((temp-tokens (make-array 100000 :element-type 'fixnum :initial-element 0 :adjustable t)) 72 | (temp-tags (make-array 100000 :element-type 'symbol :initial-element :NN :adjustable t)) 73 | (temp-string (make-array 10000 :initial-element #\a :element-type 'character :fill-pointer t :adjustable nil))) 74 | (declare (optimize (speed 3) (safety 0) (debug 0)) 75 | (type (array fixnum (*)) temp-tokens) 76 | (type (array symbol (*)) temp-tags) 77 | (type (array character (*)) temp-string)) 78 | (defun write-temp ( token tag pos ) 79 | (declare (type fixnum token pos) 80 | (type symbol tag) 81 | (optimize (speed 3) (safety 0) (debug 0) (space 0))) 82 | (setf (aref temp-tokens pos) token) 83 | (setf (aref temp-tags pos) tag) 84 | nil) 85 | 86 | (defun duplicate-from ( source start end ) 87 | (declare ;;(inline aref) 88 | (type fixnum source start end) 89 | (optimize (speed 3) (safety 0) (debug 0) (space 0))) 90 | (loop for pos fixnum from start to (1- end) do 91 | (progn 92 | (setf (aref temp-tokens pos) (aref temp-tokens source)) 93 | (setf (aref temp-tags pos) (aref temp-tags source)) 94 | nil))) 95 | 96 | (defun vector-tag-tokenized (string &key (end-tokens nil)) 97 | "Returns a document representing the string using the 98 | internal token dictionary; requires the string to be tokenized. 99 | Parses the string into tokens (whitespace separators) then populates 100 | the two temp arrays above with token id's and initial tags. Contextual 101 | rules are applied and a new vector document is produced which 102 | is a copy of the enclosed data. This is all done at once so good 103 | compilers can open-code the array refs and simplify the calling 104 | of the labels functions. 
105 | " 106 | (declare (optimize (speed 3) (safety 1) (space 0) (debug 0))) 107 | (labels ((initial-tag-all () 108 | (let ((array-offset 3) 109 | (temp-index 0)) 110 | (declare (type fixnum temp-index array-offset)) 111 | (loop for char character across string do 112 | (if (not (constituent char)) 113 | (if (= temp-index 0) 114 | (continue) 115 | (progn (setf (fill-pointer temp-string) temp-index) 116 | (mvbind (tokid tagid) (initial-tag temp-string) 117 | (write-temp tokid tagid array-offset)) 118 | (incf array-offset) 119 | (assert (< array-offset 99990)) 120 | (setf temp-index 0))) 121 | (progn (setf (char temp-string temp-index) char) 122 | (incf temp-index)))) 123 | (unless (= temp-index 0) 124 | (setf (fill-pointer temp-string) temp-index) 125 | (mvbind (tokid tagid) (initial-tag temp-string) 126 | (write-temp tokid tagid array-offset)) 127 | (incf array-offset)) 128 | ;; Append end tokens 129 | (loop for token in end-tokens do 130 | (mvbind (tokid tagid) (initial-tag token) 131 | (write-temp tokid tagid array-offset)) 132 | (incf array-offset)) 133 | ;; Put valid tokens in beginning and end of pattern-match array 134 | (duplicate-from 3 0 3) 135 | (duplicate-from (- array-offset 1) (+ 3 array-offset) (+ 6 array-offset)) 136 | (- array-offset 3)))) 137 | ;; Setup arrays and populate initial tags 138 | (let ((elements (initial-tag-all))) 139 | (declare (type fixnum elements)) 140 | ;; Run contextual fixup rules 141 | (apply-contextual-rules elements) 142 | ;; Fresh copy of the resulting data 143 | (return-vector-doc elements)))) 144 | 145 | (defun apply-contextual-rules (elements) 146 | (declare (optimize (speed 3) (safety 1) (space 0) (debug 0)) 147 | (type fixnum elements)) 148 | ;; (with-print-clock-cycles (1.67 :unit-name "cycles per rule app" :norm-f 149 | ;; #'(lambda (cycles) (/ cycles (* elements (length *tagger-contextual-rules*) 1.0)))) 150 | (loop for pos fixnum from 3 upto (+ 3 elements) do 151 | (loop for rule function in *tagger-contextual-rules* 
do 152 | (funcall rule temp-tokens temp-tags pos)))) 153 | 154 | (defun return-vector-doc (elements) 155 | (declare (optimize (speed 3) (safety 0) (space 0) (debug 0))) 156 | (make-instance 'vector-document 157 | :text (subseq temp-tokens 3 (+ elements 3)) 158 | :tags (subseq temp-tags 3 (+ elements 3))))) 159 | 160 | 161 | (defun initial-tag ( token ) 162 | "Return an initial tag for a given token string using the langutils 163 | lexicon and the tagger lexical rules (via guess-tag)" 164 | (declare (optimize (speed 3) (safety 1) (debug 0) (space 0))) 165 | (let ((id (id-for-token token nil))) 166 | (declare (type fixnum id)) 167 | (aif (hash-get *lexicon* id) 168 | (values (lexicon-entry-id it) (lexicon-entry-tag it)) ;; token id, best tag 169 | (let ((tag (guess-tag token (default-tag token) *tagger-lexical-rules*))) 170 | (add-unknown-lexicon-entry id tag) 171 | (values id tag))))) 172 | 173 | (defun default-tag ( token ) 174 | "Simple default tagging based on capitalization of token string" 175 | (declare (optimize (speed 3) (safety 0) (debug 0) (space 0))) 176 | (if (and (>= (char-code (char token 0)) (char-code #\A)) 177 | (<= (char-code (char token 0)) (char-code #\Z))) 178 | :NNP 179 | :NN)) 180 | 181 | ;; ========================================== 182 | ;; TAGGER INITIALIZATION 183 | 184 | (defun-exported init-tagger (&optional lexical-rule-file contextual-rule-file) 185 | (write-log tagger-init "Initializing the tagger") 186 | ;; Handle vector tags and tokens 187 | (unless (and *tagger-lexical-rules* *tagger-contextual-rules*) 188 | ;; Load the files 189 | (load-tagger-files lexical-rule-file contextual-rule-file))) 190 | 191 | (defun load-tagger-files ( lexical-rules contextual-rules &key bigrams wordlist ) 192 | (declare (ignore bigrams wordlist)) 193 | (setf *tagger-lexical-rules* (load-lexical-rules lexical-rules *tagger-bigrams* *tagger-wordlist*)) 194 | (setf *tagger-contextual-rules* (load-contextual-rules contextual-rules)) 195 | nil) 196 |
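The contextual rules loaded by load-tagger-files are compiled into closures over (tokens tags pos), as in make-contextual-rule-old in tagger-oldctx.lisp. A self-contained sketch of that closure pattern, using a PREVTAG-style rule; make-prevtag-rule and the keyword tags here are illustrative, not part of the library:

```lisp
;; Standalone sketch of the rule-closure pattern: a PREVTAG-style
;; rule compiled to a lambda over (tokens tags pos).  make-prevtag-rule
;; is a hypothetical helper for illustration only.
(defun make-prevtag-rule (old new t1)
  "Return a closure that rewrites tag OLD to NEW at POS when the
preceding tag is T1."
  (lambda (tokens tags pos)
    (declare (ignore tokens))
    (when (and (eq (svref tags pos) old)
               (eq (svref tags (1- pos)) t1))
      (setf (svref tags pos) new))))

;; Usage: after a determiner, prefer the noun reading.
(let ((tags (vector :DT :VB))
      (rule (make-prevtag-rule :VB :NN :DT)))
  (funcall rule #(0 1) tags 1)
  tags) ;; => #(:DT :NN)
```

Compiling each rule line to a closure like this lets the tagger's inner loop simply funcall down the rule list without re-parsing rule syntax per token.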
(defun-exported clean-tagger () 198 | (clean-lexicon) 199 | (setf *tagger-lexical-rules* nil) 200 | (setf *tagger-contextual-rules* nil) 201 | (setf *tagger-bigrams* nil) 202 | (setf *tagger-wordlist* nil)) 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /src/tokenize.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: tokenize 6 | ;;;; Purpose: Fast recursive descent stream and string tokenizer based on meta 7 | ;;;; 8 | ;;;; Programmer: Ian S. Eslick 9 | ;;;; Date Started: September 2004 10 | ;;;; 11 | 12 | (in-package :langutils-tokenize) 13 | 14 | (eval-when (:compile-toplevel :load-toplevel :execute) 15 | (defvar known-abbreviations 16 | '("Apr" "Assn" "Aug" "Av" "Ave" "Bldg" "Cf" "Co" 17 | "Corp" "Ct" "Dec" "Dept" "Dist" "Dr" "Eq" "Feb" "Fig" "Figs" 18 | "Gov" "Inc" "Jan" "Jas" "Jr" "Jul" "Jun" "Lt" "Ltd" "MM" 19 | "Mar" "May" "Mfg" "Mme" "Mr" "Mrs" "Ms" "Msec" "Mt" "Mts" 20 | "No" "Nov" "Oct" "Op" "Rd" "Rep" "Rte" "Sen" "Sep" "Sr" 21 | "St" "Stat" "Tech" "USN" "Vol" "Vs" "Yo" "a" "al" "apr" 22 | "aug" "bur" "ca" "cc" "cf" "cf" "cm" "cu" "dec" "dia" "ed" 23 | "eds" "eg" "eqn" "eqns" "est" "etc" "ex" "feb" "fig" "figs" 24 | "ft" "gm" "hp" "hr" "jan" "jul" "jun" "kc" "l" "lb" "lbs" 25 | "m" "mEq" "mar" "may" "mc" "mg" "mil" "min" "ml" "mm" "mos" 26 | "nov" "nw" "oct" "oz" "p" "pl" "pp" "prop" "r" "repr" "rev" 27 | "sec" "sep" "sq" "trans" "v" "var" "viz" "vol" "vols" "vs" 28 | "yd" "yrs"))) 29 | 30 | ;;(defun ctoi (d) (- (char-code d) #.(char-code #\0))) 31 | (deftype digit () '(member #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)) 32 | (deftype non-digit () '(not (member #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))) 33 | (deftype non-digit-or-ws () '(not (or 
whitespace digit))) 34 | 35 | (deftype whitespace () '(member #\Tab #\Space #\Newline #\Return)) 36 | (deftype non-whitespace () '(not (member #\Tab #\Space #\Newline #\Return))) 37 | 38 | (defun alpha-lowercase (ch) 39 | "Return T if the given character is a lowercase alpha character" 40 | ;; (and (>= (char-code ch) #\a) (<= (char-code ch) #\z))) 41 | (member ch 42 | '(#\a #\b #\c #\d #\e #\f #\g #\h #\i #\j #\k #\l #\m 43 | #\n #\o #\p #\q #\r #\s #\t #\u #\v #\w #\x #\y #\z))) 44 | (defun alpha-uppercase (ch) 45 | ;; (and (>= (char-code ch) #\A) (<= (char-code ch) #\Z))) 46 | (member ch 47 | '(#\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M 48 | #\N #\O #\P #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z))) 49 | (defun alpha-misc (ch) 50 | (member ch '(#\_ #\-))) 51 | 52 | (deftype alpha () '(or (satisfies alpha-lowercase) 53 | (satisfies alpha-uppercase) 54 | (satisfies alpha-misc))) 55 | 56 | (deftype alpha-upper () '(satisfies alpha-uppercase)) 57 | (deftype alpha-lower () '(satisfies alpha-lowercase)) 58 | 59 | (deftype punctuation () '(member #\] #\[ #\? #\! #\( #\) #\\ #\" #\; #\{ #\} #\: #\, #\/ #\')) 60 | 61 | (deftype alphanum () '(or digit alpha punctuation)) 62 | (deftype non-punc-or-white () '(not (or whitespace punctuation))) 63 | 64 | (define-condition end-of-sentence (condition) ()) 65 | 66 | ;; 67 | ;; NOTE: make result fixed in size, run tokenizer in passes and 68 | ;; stitch results together 69 | ;; 70 | 71 | ;; Reads a stream into a string 72 | #.(enable-meta-syntax) 73 | 74 | (let* ((length 1024) 75 | (result (make-array length :element-type 'character :adjustable t))) 76 | (declare (type fixnum length)) 77 | (defun tokenize-stream (stream &key (by-sentence nil) (fragment "") 78 | &aux (index 0) (start 0) (ch #\Space) (ws #\Space) 79 | (status :running) (sentence? nil)) 80 | "Converts a stream into a string and tokenizes, optionally, one sentence 81 | at a time which is nice for large files.
Pretty hairy code: a token 82 | processor inside a stream scanner. The stream scanner walks the input stream 83 | and tokenizes all punctuation (except periods). After a sequences of 84 | non-whitespace has been read, the inline tokenizer looks at the end of the 85 | string for mis-tokenized words (can ' t -> ca n't)" 86 | (declare (type fixnum index start) 87 | (type character ch ws) 88 | (type boolean sentence?) 89 | ;; (inline peek-char read-char char) 90 | (optimize (speed 3) (safety 0) (debug 1))) 91 | (with-stream-meta (str stream) 92 | (macrolet ((copy-fragment () 93 | `(progn 94 | (loop for i fixnum from 0 to (1- (length fragment)) do 95 | (setf (char result i) (char fragment i))))) 96 | (make-result-buffer () 97 | `(progn 98 | (setq index (length fragment)) 99 | (setq result (make-array index 100 | :element-type 'character 101 | :adjustable t 102 | :initial-contents fragment)) 103 | (when (> length index) 104 | (setq result (adjust-array result length))))) 105 | (extend-result-buffer () 106 | `(when (> index (/ length 2)) 107 | (setq length (* length 2)) 108 | (setq result (adjust-array result length)))) 109 | (array-to-string () 110 | `(progn 111 | (subseq result 0 index))) 112 | (write-ws () `(progn 113 | (setf (char result index) #\Space) 114 | (incf index) 115 | t)) 116 | (write-ch () `(progn 117 | ;; (format t "writing ~A at ~A~%" ch index) 118 | (setf (char result index) ch) 119 | (incf index) 120 | t)) 121 | ;; This split of sentence detection is because cap letters in the next sentence 122 | ;; are the only exception to single token analysis. I'll have to refactor this if 123 | ;; we need next token lookahead in the future for other punctuation fixup. 124 | ;; Potential sentence just indicates that the period was not otherwise classified 125 | (potential-sentence () 126 | `(progn 127 | ;; (format t "Potential!~%") 128 | (setq sentence? 
t))) 129 | ;; If the first letter of the next token is a capital, insert newline 130 | ;; to indicate sentence boundary 131 | (check-sentence () 132 | '(progn 133 | (when sentence? 134 | ;; (format t "Checking sentence: ~A~%" ch) 135 | (if (typep ch 'alpha-upper) 136 | (make-sentence) 137 | (setq sentence? nil))) 138 | t))) 139 | ;; BACKUP AND CLEANUP MISPARSED TOKEN SEQUENCES & IDENTIFY POTENTIAL SENTENCE BOUNDARIES 140 | (labels ((process-token-inline (&aux (write-index index)) 141 | (declare ;;(inline peek-char) 142 | (type fixnum write-index start index) 143 | (optimize speed (safety 0))) 144 | (when (= start index) (return-from process-token-inline t)) 145 | (let ((new-index 146 | (with-string-meta (string result :start (if (= start 0) 0 (1- start)) :end write-index) 147 | ;; (format t "~A ~A \"~A\"~%" index end (subseq result index end)) 148 | (labels ((test-alpha () 149 | (let* ((ch (char string index)) 150 | (code (char-code ch))) 151 | (or (and (>= code (char-code #\a)) 152 | (<= code (char-code #\z))) 153 | (and (<= code (char-code #\A)) 154 | (>= code (char-code #\Z))) 155 | (or (eq code (char-code #\_)) 156 | (eq code (char-code #\-)))))) 157 | (swap (a b &aux temp) 158 | (declare (optimize speed (safety 1))) 159 | ;; (inline char)) 160 | (setf temp (char result a)) 161 | (setf (char result a) (char string b)) 162 | (setf (char result b) temp)) 163 | (write-newline (pos) 164 | (declare (type fixnum pos) 165 | (optimize speed (safety 0))) 166 | ;; (inline char))) 167 | (setf (char result pos) #\Newline)) 168 | ;; Go back to provided write ptr and walk forward to the 169 | ;; original 'end' of valid data removing any spaces and 170 | ;; decrementing the end and index to keep track of 171 | ;; the actual end of the string. 
172 | (delete-spaces (write spaces) 173 | (delete-chars #\Space write spaces)) 174 | (delete-chars (char write count &aux (read write) (orig write)) 175 | (declare (type fixnum write read count) 176 | ;; (type character char) 177 | (type (vector character) result) 178 | (optimize speed (safety 0))) 179 | (loop while (< write end) do 180 | (if (and (< (- read write) count) 181 | (eq (char result read) char)) 182 | (progn 183 | (incf read) 184 | (decf index) 185 | (decf end)) 186 | (progn 187 | ;; (format t "w~A:'~A' <- r~A:'~A'~%" 188 | ;; write (char string write) 189 | ;; read (char string read)) 190 | (setf (char result write) 191 | (char result read)) 192 | (incf write) 193 | (incf read)))) 194 | ;; (format t "after delete: ~A~%" (subseq string orig (1- end))) 195 | t) 196 | ;; Fix contractions of the type I'll (I ' ll -> I 'll) 197 | (fix-will (&aux (old-index index)) 198 | (or 199 | (meta-match 200 | [ #\Space #\' #\Space #\l #\l !(delete-spaces (- index 3) 1)]) 201 | (progn (setq index old-index) nil))) 202 | ;; Fix possessives such as Fred (Fred ' s -> Fred's) 203 | (fix-poss (&aux (old-index index) a d) 204 | (declare (type fixnum old-index index) 205 | ;; (type character a d) 206 | (optimize speed (safety 0))) 207 | (or 208 | (meta-match 209 | [{!(test-alpha) @(digit d)} #\Space #\' #\Space #\s 210 | !(delete-spaces (- index 4) 2)]) 211 | (progn (setq index old-index) nil))) 212 | ;; Fix contractions of the type can't. 
(can ' t -> ca n't) 213 | (fix-cant (&aux (old-index index)) 214 | (or 215 | (meta-match 216 | [#\n #\Space #\' #\Space #\t 217 | !(swap (- index 4) (- index 5)) 218 | !(delete-spaces (- index 2) 1)]) 219 | (progn (setq index old-index) nil))) 220 | ;; Repair tokens within numbers so they're recognizable later 221 | (fix-numerics (&aux (old-index index) d);; 2 : 00 -> 2:00 222 | (or 223 | (meta-match 224 | [@(digit d) {#\Space} { #\: #\, #\/ } { #\Space } @(digit d) 225 | !(delete-spaces (- index 4) 2)]) 226 | (progn (setq index old-index) nil))) 227 | ;; Ignore abbreviations from the list and keep period marker 228 | (fix-abbreviations (&aux (old-index index) ws) 229 | (or 230 | (meta-match 231 | [ @(whitespace ws) !(meta-dict known-abbreviations) 232 | #\. @(whitespace ws)]) 233 | (progn (setq index old-index) nil))) 234 | ;; Keep tokens shortened using periods U.S.A, Ian S. Eslick, and so on 235 | (fix-periods (&aux (old-index index) a) 236 | (declare (optimize speed (safety 0)) 237 | ;; (inline aref char) 238 | (type string result) 239 | (type fixnum old-index index)) 240 | ;; (type character a ws)) 241 | ;; (format t "~A~%" (subseq string index end)) 242 | (or 243 | (meta-match ;; Assume sentence if not pass these filters 244 | {;; Ian S. Eslick 245 | ;; [@(whitespace ws) @(alpha a) #\. @(whitespace ws) 246 | ;; !(delete-chars #\. (- index 2) 1)] 247 | ;; Prop. , but -> Prop , but 248 | [ #\. { [ ${ @(whitespace ws) } { #\, #\; } ] 249 | ;; U.S. & m.p.h. -> U.S m.p.h 250 | [ !(test-alpha) #\. @(whitespace ws) ] } ] 251 | ;; Sentence end?: "word word. {W/w}" 252 | [@(non-punc-or-white ws) #\. @(whitespace ws) !(potential-sentence)] 253 | ;; Definitely a sentence! 254 | [{ #\! #\? 
} @(whitespace ws) !(write-newline (1- index))] 255 | }) 256 | (progn (setq index old-index) nil)))) 257 | ;; NOTE: insert period handling and sentence detection 258 | (meta-match 259 | ${ {!(fix-numerics) 260 | !(fix-will) 261 | !(fix-cant) 262 | !(fix-poss) 263 | ;; !(fix-abbreviations) 264 | !(fix-periods)} 265 | ;; gather ws 266 | [@(whitespace ch) $[@(whitespace ch) !(delete-spaces (1- index) 1)]] 267 | ;; ignore, all good 268 | [@(non-whitespace ch)] 269 | }) 270 | index)))) 271 | (setq index new-index start new-index) 272 | t)) 273 | ;; If we detect an end of sentence 274 | (make-sentence () (setf (char result (- index 3)) #\Space) 275 | (setf (char result (- index 2)) #\.) 276 | (let ((ch (char result (- index 1)))) 277 | (setf (char result (- index 1)) #\Newline) 278 | (write-ch)) 279 | (when by-sentence 280 | (setq fragment (subseq result (1- index) index)) 281 | (decf index 1) 282 | (signal 'end-of-sentence)) 283 | (setq sentence? nil) 284 | t)) 285 | ;; PROCESS THE STREAM INPUT 286 | (declare (dynamic-extent #'process-token-inline)) 287 | (when fragment (copy-fragment)) 288 | (handler-case 289 | (meta-match 290 | [${;; collapse whitespace to next token 291 | [@(whitespace ws) $[@(whitespace ws)] !(write-ws) !(process-token-inline) !(extend-result-buffer)] 292 | [@(punctuation ch) !(write-ws) !(write-ch) !(write-ws)] ;; expand syms [][?!()\";{}:,/ 293 | [@(non-punc-or-white ch) !(write-ch) !(check-sentence)] ;; accept everything else 294 | }]) 295 | (end-of-file () (process-token-inline) (setq status :done)) 296 | (end-of-sentence () (setq status :done)) 297 | (condition (c) (pprint c) (break)))) 298 | (cond ((eq status :done) 299 | (values t index (array-to-string) fragment)) 300 | (t (values nil index (array-to-string)))) ;; characters read to error 301 | )))) 302 | ;;end 303 | 304 | #.(disable-meta-syntax) 305 | 306 | (defun tokenize-string (string) 307 | "Returns a fresh, linguistically tokenized string" 308 | (with-input-from-string (s string) 309 | 
(tokenize-stream s))) 310 | 311 | ;;(defun tokenize-file (source target &key (if-exists :supersede)) 312 | ;; (?write-to-file 313 | ;; (stdutils:mvretn 3 314 | ;; (tokenize-string 315 | ;; (stdutils:read-file-to-string source))) 316 | ;; target)) 317 | 318 | 319 | ;; NOTE: Broken? 9/22/2004 320 | (defun tokenize-file2 (source-file target-file &key (if-exists :supersede) &aux (total 0) (remainder "")) 321 | "Tokenizes a plain-text file one sentence at a time" 322 | (declare (type string remainder) 323 | (type fixnum total)) 324 | (with-open-file (src source-file) 325 | (with-open-file (targ target-file :direction :output :if-exists if-exists) 326 | (loop while t do 327 | (multiple-value-bind (success count string rem) 328 | (tokenize-stream src :by-sentence t :fragment remainder) 329 | (format t "~A ~A ~A~%~A~%" success count string rem) 330 | (incf total count) 331 | (unless success 332 | (return total)) 333 | (pprint string) 334 | (setf remainder (copy-seq rem))))))) 335 | -------------------------------------------------------------------------------- /src/tokens.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: tokens 6 | ;;;; Purpose: Abstractions for word tokens; provides a unique textual token identity 7 | ;;;; covering all 'words', including abbreviations, misspellings, etc. 8 | ;;;; 9 | ;;;; Programmer: Ian S. 
Eslick 10 | ;;;; Date Started: October 2004 11 | 12 | 13 | (in-package :langutils) 14 | 15 | ;; ============================= 16 | ;; TOKEN REPRESENTATION 17 | ;; 18 | 19 | ;; 20 | ;; Fishy token identification parameters 21 | ;; 22 | (defconstant *max-token-nums* 7 23 | "The maximum number of digit characters allowed in a valid token") 24 | (defconstant *max-token-others* 1 25 | "The maximum number of non-alphanumeric characters allowed in a valid token") 26 | (defvar *suspicious-words* (make-hash-table :size 10000 :rehash-size 1.5 :rehash-threshold 0.7) 27 | "Memoize known suspicious words that have been tokenized in this hash") 28 | 29 | 30 | ;; Create a counter to track all novel terms, including 31 | ;; capitalizations. 32 | 33 | (defvar *token-counter* 0) 34 | (defvar *token-table* nil) 35 | (defvar *token-dirty-bit* nil) 36 | (defvar *tokens-load-file* nil) 37 | (defvar *id-table* nil) 38 | 39 | (defun-exported get-token-count () 40 | "Return the current token counter" 41 | *token-counter*) 42 | 43 | (defun ensure-token-counts () 44 | "Initialize the token tables if they have not yet been set up" 45 | (when (null *token-table*) 46 | (reset-token-counts))) 47 | 48 | ;;(defun save-tokens () 49 | ;; "Save the current token hash to disk if and only if 50 | ;; it has changed since it was loaded."
51 | ;; (when (and *token-dirty-bit* *tokens-load-file*) 52 | ;; (save-token-map *tokens-load-file*))) 53 | 54 | (defun initialize-tokens () 55 | (setf *token-table* (make-hash-table :size 200000 :rehash-size 1.2 :rehash-threshold 0.5)) 56 | (setf *id-table* (make-hash-table :test #'equal :size 100000 :rehash-size 1.3 :rehash-threshold 0.8))) 57 | 58 | (defun reset-token-counts () 59 | "Reset all the token data structures to an initialized but empty state" 60 | (write-log lexicon-init "Resetting token counts") 61 | ;; (save-tokens) 62 | (initialize-tokens) (setf *token-counter* 0) ;; rebuild the empty tables so interning works after a reset 63 | (setf *external-token-map* nil) 64 | (setf *id-for-token-hook* nil) 65 | (setf *token-for-id-hook* nil) 66 | (setf *add-to-map-hook* nil) 67 | (setf *token-counter-hook* nil)) 68 | 69 | ;; 70 | ;; Hook variables for persistent storage 71 | ;; 72 | 73 | (defvar *external-token-map* nil) 74 | (defvar *id-for-token-hook* nil) 75 | (defvar *token-for-id-hook* nil) 76 | (defvar *add-to-map-hook* nil) 77 | (defvar *token-counter-hook* nil) 78 | 79 | 80 | ;; 81 | ;; API 82 | ;; 83 | 84 | (defconstant *whitespace-chars* 85 | (if (boundp '*whitespace-chars*) (symbol-value '*whitespace-chars*) 86 | '(#\space #\tab #\return #\linefeed 87 | #+allegro #\%space 88 | #+lispworks #\No-Break-Space))) 89 | 90 | (defun id-for-token ( token &optional (trim t) ) 91 | "This takes string 'tokens' and 92 | returns a unique id for that character 93 | sequence - beware of whitespace, etc." 94 | (declare ;; (type hash-table *id-table* *token-table* *suspicious-words*) 95 | (optimize speed (safety 1))) 96 | (if (null token) 97 | 0 ;; NOTE: Is this wise? 
98 | (etypecase token 99 | (string 100 | (let ((basic-token (copy-seq (if trim (string-trim *whitespace-chars* token) token)))) 101 | (aif (hash-get *id-table* basic-token) 102 | it 103 | (aif (id-for-token-hook basic-token) 104 | (progn 105 | (hash-put *id-table* basic-token it) 106 | (hash-put *token-table* it (string-downcase basic-token)) 107 | it) 108 | (let ((id (if *token-counter-hook* 109 | (token-counter-hook) 110 | (incf *token-counter*)))) 111 | (if (suspicious-string? basic-token) 112 | (hash-put *suspicious-words* id t)) 113 | (hash-put *token-table* id basic-token) 114 | (hash-put *id-table* basic-token id) 115 | (add-to-map-hook basic-token id) 116 | (setf *token-dirty-bit* t) 117 | id))))) 118 | (number token)))) 119 | 120 | (defun ids-for-tokens ( tokens ) 121 | (mapcar #'id-for-token tokens)) 122 | 123 | (defun ids-for-string ( string ) 124 | (ids-for-tokens (extract-words (mvretn 3 (tokenize-string string))))) 125 | 126 | (defun token-for-id ( id ) 127 | "Return a string token for a given token id" 128 | (error-on-null 129 | (aif (hash-get *token-table* id) 130 | it 131 | (aif (token-for-id-hook id) 132 | it 133 | nil)) 134 | "Unknown token for id: ~A." 
id)) 135 | 136 | (defun tokens-for-ids ( ids ) 137 | "Return a list of string tokens for each id in ids" 138 | (mapcar #'token-for-id ids)) 139 | 140 | (defun string->token-array ( string ) 141 | (let ((words (extract-words 142 | (mvretn 3 (tokenize-string 143 | (concatenate 'string string " EOF")))))) 144 | (make-array (1- (length words)) :element-type 'fixnum 145 | :initial-contents (mapcar #'id-for-token (butlast words))))) 146 | 147 | 148 | ;; 149 | ;; Hook in an external source of storage 150 | ;; 151 | 152 | (defun add-external-mapping (id-for-token token-for-id add-to-map token-counter) 153 | (setq *external-token-map* t) 154 | (setq *id-for-token-hook* id-for-token) 155 | (setq *token-for-id-hook* token-for-id) 156 | (setq *add-to-map-hook* add-to-map) 157 | (setq *token-counter-hook* token-counter)) 158 | 159 | (defun id-for-token-hook (token) 160 | (when *id-for-token-hook* 161 | (funcall *id-for-token-hook* token))) 162 | 163 | (defun token-for-id-hook (id) 164 | (when *token-for-id-hook* 165 | (funcall *token-for-id-hook* id))) 166 | 167 | (defun add-to-map-hook (token id) 168 | (when *add-to-map-hook* 169 | (funcall *add-to-map-hook* token id))) 170 | 171 | (defun token-counter-hook () 172 | (funcall *token-counter-hook*)) 173 | 174 | ;; Loading map from file 175 | 176 | #| 177 | (defparameter *default-token-map-file-int* 178 | (translate-logical-pathname "think:data;lang;en;langutils;initial-token-map.sexp")) 179 | 180 | (defun save-token-map (filename) 181 | "This uses the serialize-sexp library to save the token 182 | database hashes and counter to a disk file" 183 | (write-log lexicon-init "Saving token map to: ~A" filename) 184 | (let ((save-to 185 | (cond ((not (null filename)) filename) 186 | (t *default-token-map-file-int*)))) 187 | (with-open-file (s save-to :direction :output :if-exists :supersede) 188 | (serialize-sexp 189 | (list 190 | *token-counter* 191 | *token-table* 192 | *id-table*) 193 | s)) 194 | t)) 195 | 196 | (defun 
load-token-map (filename) 197 | "Load an s-serialized version of the token map data 198 | structures from a file" 199 | (write-log lexicon-init "Loading and initializing token map from: ~A" filename) 200 | (cond ((and (null filename) (null *default-token-map-file-int*)) 201 | (reset-token-counts)) 202 | ((null filename) 203 | (if (probe-file *default-token-map-file-int*) 204 | (progn (setf *tokens-load-file* filename) 205 | (load-token-map *default-token-map-file-int*)) 206 | (reset-token-counts))) 207 | (t (with-open-file (s filename ) 208 | (setf *tokens-load-file* filename) 209 | (let ((list (deserialize-sexp s))) 210 | (setf *token-counter* (first list)) 211 | (setf *token-table* (second list)) 212 | (setf *id-table* (third list)))) 213 | t))) 214 | |# 215 | 216 | ;; ======================================== 217 | ;; FISHY TOKENS 218 | 219 | (defmethod suspicious-word? ((word fixnum)) 220 | "Find a suspicious word using its token id" 221 | (hash-get *suspicious-words* word)) 222 | 223 | (defun suspicious-string? 
(string) 224 | "Return true when the string contains too many digits or 225 | non-alphanumeric characters for linguistic processing" 226 | (let ((nums 0) 227 | (others 0)) 228 | (declare (type fixnum nums others) 229 | ;; (inline alpha-char-p digit-char-p) 230 | (optimize speed (safety 1))) 231 | (loop for it across string do 232 | (cond ((alpha-char-p it) nil) 233 | ((digit-char-p it) (incf nums)) 234 | (t (incf others)))) 235 | (or (> nums *max-token-nums*) 236 | (> others *max-token-others*)))) -------------------------------------------------------------------------------- /src/vector-keyed-table.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: vector-keyed-table.lisp 6 | ;;;; Purpose: Fast vector element hash implementation 7 | ;;;; Programmer: Ian S. 
Eslick 8 | ;;;; Date Started: August 2004 9 | ;;;; 10 | 11 | (in-package :langutils) 12 | 13 | ;; ---------------------------------------------------------- 14 | ;; VECTOR-HASH -- Vector Keyed Hash Trees 15 | ;; 16 | ;; Tree of hashes for fast lookup of integer simple-vector keys 17 | ;; over very large vector spaces 18 | ;; 19 | ;; Assumptions: 20 | ;; * Vector elements are positive integers 1...n 21 | ;; * You are loading thousands of elements (expensive otherwise) 22 | ;; * Sparsely populated tree 23 | ;; ---------------------------------------------------------- 24 | 25 | (defparameter *vector-keyed-threshold* 5 26 | "When to switch internally from assoc list to hash table") 27 | 28 | (defparameter *vechash-threshold* 20) 29 | 30 | (defclass-exported vector-keyed-table (table) 31 | ((root :accessor table-root 32 | :initarg :root) 33 | (threshold :reader vhash-threshold 34 | :initarg :threshold 35 | :initform *vechash-threshold*) 36 | (count :accessor table-count 37 | :initform 0))) 38 | 39 | (defmethod-exported initialize-instance :after ((table vector-keyed-table) &rest initargs &key &allow-other-keys) 40 | (clear table)) 41 | 42 | ;; 43 | ;; ---------------------- 44 | ;; 45 | 46 | (defparameter *count-updated* nil) 47 | 48 | (defun make-vknode (&optional value table) 49 | (cons value table)) 50 | 51 | (defun vknode-value (node) 52 | (car node)) 53 | 54 | (defsetf vknode-value rplaca) 55 | 56 | (defun vknode-table (node) 57 | (cdr node)) 58 | 59 | (defsetf vknode-table rplacd) 60 | 61 | (defun end-of-vkey (vkey index) 62 | (= (length vkey) (1+ index))) 63 | 64 | (defun extend-vktable (table key) 65 | "Add a vk node to the table for key" 66 | (add table key (make-vknode))) 67 | 68 | (defun upgrade-vknode (node) 69 | "Upgrade vktable entry from assoc to hashed when the size 70 | exceeds the vk threshold. Do this to the table in the 71 | provided vknode. 
Also downgrade (is this a good idea as 72 | it may cause thrashing?)" 73 | (when node 74 | (let ((ctable (vknode-table node))) 75 | (when ctable 76 | (cond ((and (subtypep (type-of ctable) 'assoc-table) 77 | (>= (size-of ctable) *vector-keyed-threshold*)) 78 | (setf (vknode-table node) (convert ctable 'hashed-table)))))))) 79 | ;; ((and (subtypep (type-of ctable) 'hashed-table) 80 | ;; (< (size-of ctable) *vector-keyed-threshold*)) 81 | ;; (setf (vknode-table node) (convert ctable 'assoc-table)))))))) 82 | 83 | (defun ensure-vktable (node) 84 | "Ensure that the provided node has a proper table for 85 | the next recursion of vector-keyed-put" 86 | (aif-ret (vknode-table node) 87 | (setf (vknode-table node) (make-instance 'assoc-table)) 88 | (vknode-table node))) 89 | 90 | (defun vktable-get-extend (vktable node key &aux (table (ensure-vktable node))) 91 | "Get the node for key from table. If it is missing, update the 92 | vktable item count, add a new node to the table for key, and 93 | upgrade the table to hashed if it has exceeded the threshold size; 94 | return the node" 95 | (aif-ret (get-value table key) 96 | (unless *count-updated* 97 | (incf (table-count vktable)) 98 | (setq *count-updated* t)) 99 | (extend-vktable table key) 100 | (upgrade-vknode node) 101 | (get-value (vknode-table node) key))) 102 | 103 | (defun vector-keyed-put (vktable vkey value) 104 | "Internal recursion to walk tree and add or modify a value for 105 | vector key: vkey" 106 | (setf *count-updated* nil) 107 | (labels ((rec (node index) 108 | (let* ((key (aref vkey index)) 109 | (nextn (vktable-get-extend vktable node key))) 110 | (if (end-of-vkey vkey index) 111 | (progn (setf (vknode-value nextn) value) value) 112 | (rec nextn (1+ index)))))) 113 | (rec (make-vknode nil (table-root vktable)) 0))) 114 | 115 | (defun drop-vknode-value (tnode stack) 116 | "Clear the value in the target node (tnode); if the node's 117 | table is nil or empty, instead delete the entry in the parent 118 | table recorded on the stack, recursing up while parents empty" 119 | (if (or (null 
(vknode-table tnode)) 119 | (empty (vknode-table tnode))) 120 | (dbind (key . nextn) (car stack) 121 | (drop-vktable-entry key nextn (cdr stack))) 122 | (setf (vknode-value tnode) nil))) 123 | 124 | (defun drop-vktable-entry (key node stack) 125 | (drop (vknode-table node) key) 126 | (when (and stack 127 | (empty (vknode-table node)) 128 | (null (vknode-value node))) 129 | (dbind (key . nextn) (car stack) 130 | (drop-vktable-entry key nextn (cdr stack))))) 131 | 132 | (defun vector-keyed-rem (vktable vkey) 133 | "Remove a vector keyed value from the vktable 134 | and clean up any empty nodes or tables created 135 | thereby. Also decrement the count" 136 | (let ((stack nil)) 137 | (labels ((rec (node index) 138 | (when node 139 | (let* ((key (aref vkey index)) 140 | (table (vknode-table node)) 141 | (nextn (when table (get-value table key)))) 142 | (push (cons key node) stack) 143 | (when nextn 144 | (if (end-of-vkey vkey index) 145 | (progn 146 | (drop-vknode-value nextn stack) 147 | (decf (table-count vktable)) 148 | (when (empty vktable) 149 | (clear vktable)) 150 | t) 151 | (rec nextn (1+ index)))))))) 152 | (rec (make-vknode nil (table-root vktable)) 0)))) 153 | 154 | (defun vector-keyed-get (vktable vkey) 155 | "Internal recursion to walk tree and return value for vector 156 | key: vkey" 157 | (labels ((rec (table index) 158 | (when table 159 | (awhen (get-value table (aref vkey index)) 160 | (if (end-of-vkey vkey index) 161 | (vknode-value it) 162 | (rec (vknode-table it) (1+ index))))))) 163 | (rec (table-root vktable) 0))) 164 | 165 | ;; 166 | ;; ---------------------- 167 | ;; 168 | 169 | (defmethod get-value ((table vector-keyed-table) key) 170 | (assert (subtypep (type-of key) 'array)) 171 | (vector-keyed-get table key)) 172 | 173 | (defmethod (setf get-value) (value (table vector-keyed-table) key) 174 | (assert (subtypep (type-of key) 'array)) 175 | (vector-keyed-put table key value)) 176 | 177 | (defmethod drop ((table vector-keyed-table) key) 178 | 
(assert (subtypep (type-of key) 'array)) 179 | (vector-keyed-rem table key)) 180 | 181 | (defmethod clear ((table vector-keyed-table)) 182 | (setf (table-root table) 183 | (make-instance 'hashed-table 184 | :hash (make-hash-table :test #'eq :size 1000 :rehash-size 1.5 :rehash-threshold 0.7))) 185 | (setf (table-count table) 0) 186 | t) 187 | 188 | (defmethod size-of ((table vector-keyed-table)) 189 | (table-count table)) 190 | 191 | (defmethod storage-allocated ((table vector-keyed-table)) 192 | ;; NOTE: TODO 193 | ) 194 | 195 | (defclass vector-keyed-table-iterator (iterator) 196 | ((reference :accessor reference :initarg :reference) 197 | (type :accessor iter-type :initarg :type) 198 | (last :accessor last-key :initform nil) 199 | (stack :accessor vkti-stack :initform nil))) 200 | 201 | (defmethod get-iterator ((vktable vector-keyed-table) &key (type :pair)) 202 | (make-instance 'vector-keyed-table-iterator 203 | :reference vktable 204 | :type type)) 205 | 206 | (defmethod initialize-instance :after ((iter vector-keyed-table-iterator) &rest initargs &key &allow-other-keys) 207 | (reset iter)) 208 | 209 | (defmacro mvpass2 (form) 210 | `(aif2 ,form 211 | (values it t) 212 | (values nil nil))) 213 | 214 | (defmethod next-value ((iter vector-keyed-table-iterator)) 215 | "Invariant: stack always contains a null, exhausted or intermediate table iterator" 216 | (with-slots (stack) iter 217 | (cond ((null stack) 218 | (values nil nil)) 219 | ((or (null (car stack)) 220 | (not (next-p (car stack)))) 221 | (pop stack) 222 | (pop (last-key iter)) 223 | (mvpass2 (next-value iter))) 224 | (t 225 | (mvpass2 (vkti-next-value iter)))))) 226 | 227 | (defun vkti-next-value (iter) 228 | (with-slots (stack) iter 229 | (let* ((iterator (car stack)) 230 | (kvpair (next-value iterator)) 231 | (key (car kvpair)) 232 | (node (cdr kvpair))) 233 | (push key (last-key iter)) 234 | (aif (vknode-table node) 235 | (push (get-iterator it) stack) 236 | (push nil stack)) 237 | (aif (vknode-value 
node) 238 | (values (extract-assoc-type (cons (list->array (reverse (last-key iter))) it) 239 | (iter-type iter)) 240 | t) 241 | (mvpass2 (next-value iter)))))) 242 | 243 | (defmethod-exported next-p ((iter vector-keyed-table-iterator)) 244 | (with-slots (stack) iter 245 | (and stack 246 | (not (every (lambda (iter) 247 | (or (null iter) (not (next-p iter)))) 248 | stack))))) 249 | 250 | (defmethod-exported drop-last ((iter vector-keyed-table-iterator)) 251 | (awhen (last-key iter) 252 | (drop (reference iter) (list->array (reverse it))) 253 | (setf (last-key iter) nil))) 254 | 255 | (defmethod-exported reset ((iter vector-keyed-table-iterator)) 256 | (setf (vkti-stack iter) 257 | (list (get-iterator (table-root (reference iter)))))) 258 | 259 | (defmethod-exported clear ((iter vector-keyed-table-iterator)) 260 | (setf (vkti-stack iter) nil)) 261 | 262 | -------------------------------------------------------------------------------- /src/vectors.lisp: -------------------------------------------------------------------------------- 1 | ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: utils -*- 2 | ;;;; ************************************************************************* 3 | ;;;; FILE IDENTIFICATION 4 | ;;;; 5 | ;;;; Name: vectors 6 | ;;;; Purpose: Macro system for compiling efficient vector match & side effect rules 7 | ;;;; 8 | ;;;; Programmer: Ian S. 
Eslick 9 | ;;;; Date Started: October 2004 10 | ;;;; 11 | 12 | (in-package :langutils) 13 | #| 14 | - Match pattern to vector(s) 15 | - integer or byte-level pattern matching 16 | - Can figure which rule to apply in parallel 17 | - Can operate on more than one array at a time 18 | - A matching rule results in a side effect 19 | - a procedure call with 20 | - source data & focal point 21 | - current focus only 22 | - all element matching pattern 23 | - a direct side effect on the vector 24 | 25 | Dictionary for resolving symbolic labels to integers 26 | |# 27 | 28 | ;; ------------------------------------ 29 | ;; VECTOR GROUPS 30 | ;; ------------------------------------ 31 | 32 | (defvar *vector-match-groups* (hash :test #'equal)) 33 | 34 | (defstruct vector-group 35 | "A vector group is a set of applicable rules 36 | to match against a source vector. Groups can 37 | be compiled to produce efficient match procedures." 38 | (name nil :type symbol) 39 | vectors ;; alist of alists; vector names & attributes 40 | (rules nil) ;; hash: rule name to rule mapping 41 | (stats nil) ;; hash: keeps stats on rule use frequency 42 | (dirty nil :type boolean) ;; whether new rules have been added 43 | match-proc) ;; the match procedure for this group 44 | 45 | (defun get-group (name) 46 | (hash-get *vector-match-groups* name)) 47 | 48 | (defun vector-group-vector-names (group) 49 | (mapcar #'car 50 | (vector-group-vectors group))) 51 | 52 | (defun vector-group-vec-type (gname vname) 53 | (cdr (assoc :type (cdr (assoc vname (vector-group-vectors (get-group gname))))))) 54 | 55 | (defun vector-group-vec-dict (gname vname) 56 | (cdr (assoc :dict (cdr (assoc vname (vector-group-vectors (get-group gname))))))) 57 | 58 | (defmacro-exported undef-vector-group (name) 59 | `(hash-rem *vector-match-groups* ',name)) 60 | 61 | (defmacro-exported def-vector-group (name &rest vector-descriptions) 62 | (let ((group (gensym))) 63 | `(eval-when (compile eval load) 64 | (let ((,group 
(make-vector-group :name ',(localize-symbol name) :rules (hash) :stats (hash)))) 65 | (hash-put *vector-match-groups* ',(localize-symbol name) ,group) 66 | (setf (vector-group-vectors ,group) 67 | ',(mapcar #'(lambda (vector) 68 | (if (listp vector) 69 | (destructuring-bind (name &key type dict) vector 70 | `(,name . ((:type . ,type) (:dict . ,dict)))) 71 | `(,vector nil))) 72 | vector-descriptions)) 73 | ',name)))) 74 | 75 | (def-vector-group tagged-text 76 | (tokens :type (simple-vector fixnum) 77 | :dict #'id-for-token) 78 | (tags :type (simple-vector unsigned-byte) 79 | :dict #'if-for-tag)) 80 | 81 | ;; ------------------------------------ 82 | ;; VECTOR MATCH PATTERNS 83 | ;; ------------------------------------ 84 | 85 | (def-vector-pattern assoc-left contextual-rules 86 | "Name/purpose of rule" 87 | (tags (PP JJ nil *NN nil nil NN)) 88 | (tokens ( *foo)) 89 | -> 90 | '(setf (svref tags focus) (svref tags (- focus 1)))) 91 | ;; (match #'capture-matches) 92 | ;; (pattern #'store-pattern) 93 | ;; (focal #'capture-target)) 94 | 95 | (defstruct vector-pattern filter-proc result-proc patterns) 96 | 97 | 98 | (defmacro-exported def-vector-pattern (vname gname &rest productions) 99 | (with-gensyms (gsym) 100 | (unless (get-group gname) (error "Group ~A is not defined.~%" gname)) 101 | (mvbind (patterns consequent) (split-list '-> productions) 102 | (when (null patterns) (error "No patterns found.~%")) 103 | `(let ((,gsym (get-group ',gname))) 104 | (hash-put (vector-group-rules ,gsym) 105 | ',vname 106 | ,(construct-make-vp (get-group gname) patterns consequent)))))) 107 | 108 | (defun construct-make-vp (group patterns consequent) 109 | `(make-vector-pattern 110 | :filter-proc #'(lambda ,(vector-group-vector-names group) 111 | ;; (declare (inline match-vectors)) 112 | ,@(get-vector-declarations group) 113 | (and ,@(mapcar #'(lambda (vname) 114 | `(match-vectors ,vname (cdr (assoc ',vname ',patterns)))) 115 | (vector-group-vector-names group)))) 116 | :result-proc #'(lambda 
,(vector-group-vector-names group) 117 | ,@consequent) 118 | :patterns ',patterns)) 119 | 120 | #| Unfinished design notes and scratch code: 121 | (make-vector-pattern 122 | :filter-proc #'(lambda ( 123 | 124 | (vector-match group (tokens tags) [(start end)|pos] 125 | :mode [:compiled|:iterate]) 126 | 127 | ;; Compiler 128 | 129 | Can apply rules one at a time (debug), default is to compile if not compiled 130 | 131 | Want to minimize the # of rules applicable after every decision point 132 | Build a tree of decision points, weighted by any frequency data 133 | emit a decision tree 134 | bottom runs code or calls function 135 | 136 | (defun find-best-strategy () 137 | "Returns a tree of positions to check that minimizes total, weighted rule cost" 138 | ) 139 | 140 | (defun build-decision-tree (decision-tree) 141 | "Constructs a code sequence to implement the decision tree" 142 | ) 143 | 144 | (defun compile-group-rules (group) 145 | "Returns a procedure that applies all rules to the 146 | structures defined in group" 147 | (compile (build-decision-tree 148 | 149 | (defun generate-production-call (expr) 150 | "Generates the code for a call 151 | 152 | |# 153 | --------------------------------------------------------------------------------
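The nested-table scheme in `src/vector-keyed-table.lisp` is easier to see outside of Lisp. Below is a minimal, hypothetical Python sketch of the same idea — a trie of dictionaries keyed by successive integer elements of the key vector, with the value stored on the node reached by the final element. It deliberately omits the assoc-list-to-hash-table upgrade the Lisp version performs once a node's table passes `*vector-keyed-threshold*`; the class and method names here are illustrative and not part of langutils.

```python
# Sketch of a vector-keyed table: each node is a [value, children] pair,
# mirroring the (value . table) cons cells built by make-vknode.

class VectorKeyedTable:
    """Maps sequences of integers to values via a nested-dict trie."""

    def __init__(self):
        self.root = {}   # element -> [value, children-dict]
        self.count = 0   # number of distinct keys stored (cf. table-count)

    def put(self, vkey, value):
        """Walk/extend the trie along vkey; set value on the final node."""
        table = self.root
        for i, key in enumerate(vkey):
            node = table.setdefault(key, [None, {}])
            if i == len(vkey) - 1:
                if node[0] is None:       # new key, not an overwrite
                    self.count += 1
                node[0] = value
            table = node[1]

    def get(self, vkey):
        """Return the value stored for vkey, or None if absent."""
        table = self.root
        node = None
        for key in vkey:
            node = table.get(key)
            if node is None:
                return None
            table = node[1]
        return node[0] if node else None
```

As in the Lisp version, a key and one of its prefixes (`[1, 2, 3]` and `[1, 2]`) coexist without collision, because the prefix's value lives on an interior node while its children carry the longer key.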