├── src ├── _version.js ├── 03-three │ ├── numbers │ │ ├── plugin.js │ │ ├── find.js │ │ ├── parse │ │ │ ├── _data.js │ │ │ ├── index.js │ │ │ └── fromText.js │ │ ├── format │ │ │ ├── index.js │ │ │ └── toText.js │ │ └── data.js │ ├── topics │ │ ├── plugin.js │ │ └── api.js │ ├── contractions │ │ ├── plugin.js │ │ └── api.js │ ├── nouns │ │ ├── plugin.js │ │ └── api.js │ ├── adjectives │ │ ├── plugin.js │ │ └── api.js │ └── verbs │ │ ├── plugin.js │ │ └── api │ │ ├── adverbs.js │ │ ├── parse.js │ │ ├── toJSON.js │ │ └── find.js ├── 02-two │ ├── preTagger │ │ ├── methods │ │ │ ├── index.js │ │ │ └── guessGender.js │ │ ├── compute │ │ │ ├── 2nd-pass │ │ │ │ ├── noun-fallback.js │ │ │ │ ├── neighbours.js │ │ │ │ ├── suffix-lookup.js │ │ │ │ └── acronym.js │ │ │ ├── 3rd-pass │ │ │ │ ├── fix-contractions.js │ │ │ │ ├── adj-plurals.js │ │ │ │ ├── number-types.js │ │ │ │ ├── noun-gender.js │ │ │ │ ├── noun-plurals.js │ │ │ │ ├── adj-gender.js │ │ │ │ ├── verb-tense.js │ │ │ │ └── verb-form.js │ │ │ ├── 1st-pass │ │ │ │ ├── titlecase.js │ │ │ │ ├── regex.js │ │ │ │ └── year.js │ │ │ └── index.js │ │ ├── model │ │ │ ├── index.js │ │ │ ├── regex │ │ │ │ ├── regex-text.js │ │ │ │ ├── regex-normal.js │ │ │ │ └── regex-numbers.js │ │ │ └── suffixes.js │ │ ├── plugin.js │ │ └── tagRank.js │ ├── postTagger │ │ ├── plugin.js │ │ └── matches.js │ └── tagset │ │ ├── plugin.js │ │ └── tags │ │ ├── values.js │ │ ├── dates.js │ │ ├── misc.js │ │ ├── nouns.js │ │ └── verbs.js ├── _lib.js ├── 01-one │ ├── lexicon │ │ ├── methods │ │ │ ├── index.js │ │ │ ├── model.js │ │ │ ├── noun │ │ │ │ └── index.js │ │ │ ├── adjective │ │ │ │ └── index.js │ │ │ └── verb │ │ │ │ └── index.js │ │ ├── plugin.js │ │ ├── model │ │ │ ├── misc.js │ │ │ └── lexicon.js │ │ └── compute │ │ │ └── root.js │ └── tokenize │ │ ├── plugin.js │ │ ├── compute │ │ ├── index.js │ │ └── machine.js │ │ ├── contractions.js │ │ └── unicode.js └── index.js ├── data ├── lexicon │ ├── misc │ │ ├── determiners.js │ │ ├── 
conjunctions.js │ │ ├── prepositions.js │ │ ├── expressions.js │ │ ├── currencies.js │ │ └── adverbs.js │ ├── dates │ │ ├── dates.js │ │ ├── weekdays.js │ │ └── months.js │ ├── nouns │ │ ├── feminine.js │ │ ├── pronouns.js │ │ ├── possessives.js │ │ ├── uncountables.js │ │ ├── masculine.js │ │ └── sportsTeams.js │ ├── numbers │ │ ├── ordinals.js │ │ ├── cardinals.js │ │ └── units.js │ ├── people │ │ ├── firstnames.js │ │ ├── honorifics.js │ │ └── people.js │ ├── misc.js │ ├── places │ │ ├── places.js │ │ └── regions.js │ └── index.js └── models │ ├── _lint.js │ └── index.js ├── plugins └── dates │ ├── src │ ├── phrase │ │ ├── date │ │ │ ├── 01-date.js │ │ │ ├── 02-year.js │ │ │ ├── 03-misc.js │ │ │ ├── data.js │ │ │ ├── units.js │ │ │ └── index.js │ │ ├── normalize.js │ │ └── index.js │ ├── plugin.js │ ├── toJson.js │ ├── find.js │ └── api.js │ ├── tests │ ├── _lib.js │ ├── backburner │ │ ├── ambig-weekday.ignore.js │ │ ├── equals.ignore.js │ │ ├── to-iso.ignore.js │ │ └── ambig-month.ignore.js │ └── dates.test.js │ ├── README.md │ ├── rollup.config.js │ ├── index.d.ts │ ├── scratch.js │ └── package.json ├── .gitignore ├── tmp.js ├── learn ├── giga │ ├── makeModel.js │ ├── french.js │ ├── _giga.js │ ├── getList.js │ ├── getPairs.js │ ├── corpus.js │ └── test.js ├── adjectives │ └── learn.js ├── wiktionary │ ├── add.js │ └── index.js ├── wikinews │ ├── packSuffixes.js │ ├── parse.js │ ├── getLexicon.js │ └── getSuffix.js ├── nouns │ └── learn.js ├── wolf │ └── parse_wolf.js └── verbs │ ├── old.js │ ├── toPairs.js │ ├── single-pairs.js │ └── learn.js ├── scripts ├── version.js ├── stress.js ├── cleanup.js ├── pack.js └── types.ts ├── tests ├── _lib.js ├── buildNet.test.js ├── conjugate.test.js └── numbers │ ├── ordinal.test.js │ └── number-misc.test.js ├── add-verbs.js ├── rollup.config.js ├── .esformatter ├── changelog.md ├── LICENSE ├── .github └── workflows │ └── build-and-test.yml ├── .eslintrc ├── package.json ├── types ├── view │ └── fr.ts ├── index.d.ts └── 
misc.ts └── scratch.js /src/_version.js: -------------------------------------------------------------------------------- 1 | export default '0.2.8' -------------------------------------------------------------------------------- /src/03-three/numbers/plugin.js: -------------------------------------------------------------------------------- 1 | import api from './api.js' 2 | 3 | export default { 4 | api 5 | } -------------------------------------------------------------------------------- /src/03-three/topics/plugin.js: -------------------------------------------------------------------------------- 1 | import api from './api.js' 2 | 3 | export default { 4 | api 5 | } -------------------------------------------------------------------------------- /data/lexicon/misc/determiners.js: -------------------------------------------------------------------------------- 1 | export default ['le', 'la', 'les', 'au', 'aux', 'ol', 'un', 'une'] 2 | -------------------------------------------------------------------------------- /plugins/dates/src/phrase/date/01-date.js: -------------------------------------------------------------------------------- 1 | const parse = function () { 2 | 3 | } 4 | export default parse -------------------------------------------------------------------------------- /plugins/dates/src/phrase/date/02-year.js: -------------------------------------------------------------------------------- 1 | const parse = function () { 2 | 3 | } 4 | export default parse -------------------------------------------------------------------------------- /plugins/dates/src/phrase/date/03-misc.js: -------------------------------------------------------------------------------- 1 | const parse = function () { 2 | 3 | } 4 | export default parse -------------------------------------------------------------------------------- /src/03-three/contractions/plugin.js: -------------------------------------------------------------------------------- 1 | import api from './api.js' 2 
| 3 | export default { 4 | api 5 | } -------------------------------------------------------------------------------- /src/03-three/nouns/plugin.js: -------------------------------------------------------------------------------- 1 | import api from './api.js' 2 | 3 | export default { 4 | api, 5 | } 6 | -------------------------------------------------------------------------------- /src/03-three/adjectives/plugin.js: -------------------------------------------------------------------------------- 1 | import api from './api.js' 2 | 3 | export default { 4 | api, 5 | } 6 | -------------------------------------------------------------------------------- /src/03-three/verbs/plugin.js: -------------------------------------------------------------------------------- 1 | import api from './api/api.js' 2 | 3 | export default { 4 | api, 5 | } 6 | -------------------------------------------------------------------------------- /data/lexicon/dates/dates.js: -------------------------------------------------------------------------------- 1 | // uncontroversial date words 2 | export default ['aujourd\'hui', 'demain', 'hier', 'weekend'] 3 | -------------------------------------------------------------------------------- /data/lexicon/misc/conjunctions.js: -------------------------------------------------------------------------------- 1 | export default ['et', 'mais', 'soit', 'puis', 'car', 'voire', 'sinon', 'comme', 'donc'] 2 | -------------------------------------------------------------------------------- /data/lexicon/nouns/feminine.js: -------------------------------------------------------------------------------- 1 | export default ['confiture', 'géologie', 'librairie', 'ambulance', 'poule', 'rue', 'lutte'] 2 | -------------------------------------------------------------------------------- /src/02-two/preTagger/methods/index.js: -------------------------------------------------------------------------------- 1 | import guessGender from './guessGender.js' 2 | export 
default { one: { guessGender } } 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | build/ 3 | .DS_Store 4 | coverage 5 | wolf-1.0b4.xml 6 | wikinews.txt 7 | /learn/giga/results/*.js 8 | learn/scrape/* -------------------------------------------------------------------------------- /src/02-two/postTagger/plugin.js: -------------------------------------------------------------------------------- 1 | import postTagger from './matches.js' 2 | 3 | export default { 4 | compute: { 5 | postTagger 6 | }, 7 | hooks: ['postTagger'] 8 | } -------------------------------------------------------------------------------- /src/_lib.js: -------------------------------------------------------------------------------- 1 | // console.log('local-path') 2 | // import nlp from '/Users/spencer/mountain/compromise/src/one.js' 3 | import nlp from 'compromise/one' 4 | export default nlp -------------------------------------------------------------------------------- /src/01-one/lexicon/methods/index.js: -------------------------------------------------------------------------------- 1 | import adjective from './adjective/index.js' 2 | import noun from './noun/index.js' 3 | import verb from './verb/index.js' 4 | 5 | export default { adjective, noun, verb } 6 | -------------------------------------------------------------------------------- /tmp.js: -------------------------------------------------------------------------------- 1 | import verbs from './data/models/verb/present-tense.js' 2 | import lex from './data/lexicon/index.js' 3 | Object.keys(verbs).forEach(k => { 4 | if (!lex[k]) { 5 | console.log(k) 6 | } 7 | }) 8 | 9 | -------------------------------------------------------------------------------- /data/lexicon/nouns/pronouns.js: -------------------------------------------------------------------------------- 1 | // are these right? 
2 | export default [ 3 | 'il', 4 | 'c', 5 | 'elle', 6 | 'on', 7 | 'ils', 8 | 'nous', 9 | 'je', 10 | 'ce', 11 | 'j', 12 | 'elles', 13 | 'vous', 14 | 'tu', 15 | 't', 16 | 'moi', 17 | ] 18 | -------------------------------------------------------------------------------- /learn/giga/makeModel.js: -------------------------------------------------------------------------------- 1 | import data from './results/plural-sing.js' 2 | import { learn, compress, test, validate } from 'suffix-thumb' 3 | 4 | const pairs = validate(data) 5 | test(pairs) 6 | const model = learn(pairs) 7 | console.log(JSON.stringify(model, null, 2)) 8 | 9 | -------------------------------------------------------------------------------- /data/models/_lint.js: -------------------------------------------------------------------------------- 1 | import model from './verb/present-tense.js' 2 | 3 | Object.keys(model).forEach(k => { 4 | let s = new Set() 5 | model[k].slice(1).forEach(str => { 6 | if (s.has(str)) { 7 | console.log(k, str) 8 | } 9 | s.add(str) 10 | }) 11 | }) -------------------------------------------------------------------------------- /scripts/version.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | // avoid requiring our whole package.json file 3 | // make a small file for our version number 4 | let pkg = JSON.parse(fs.readFileSync('./package.json').toString()) 5 | 6 | fs.writeFileSync('./src/_version.js', `export default '${pkg.version}'`) 7 | -------------------------------------------------------------------------------- /tests/_lib.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-console */ 2 | import build from '../builds/fr-compromise.mjs' 3 | import src from '../src/index.js' 4 | let nlp = src 5 | if (process.env.TESTENV === 'prod') { 6 | console.warn('== production build test 🚀 ==') 7 | nlp = build 8 | } 9 | export default nlp 10 | 
-------------------------------------------------------------------------------- /plugins/dates/src/plugin.js: -------------------------------------------------------------------------------- 1 | import api from './api.js' 2 | 3 | let lexicon = { 4 | heir: 'Date', 5 | soir: 'Date', 6 | nuit: 'Date', 7 | 'soirée': 'Date', 8 | matin: 'Date', 9 | 'après midi': 'Date', 10 | semaine: 'Duration', 11 | } 12 | 13 | export default { 14 | words: lexicon, 15 | api, 16 | } -------------------------------------------------------------------------------- /src/01-one/lexicon/plugin.js: -------------------------------------------------------------------------------- 1 | import methods from './methods/index.js' 2 | import words from './model/lexicon.js' 3 | import root from './compute/root.js' 4 | 5 | export default { 6 | methods: { 7 | two: { 8 | transform: methods 9 | } 10 | }, 11 | words, 12 | compute: { 13 | root: root 14 | } 15 | } -------------------------------------------------------------------------------- /src/01-one/tokenize/plugin.js: -------------------------------------------------------------------------------- 1 | import unicode from './unicode.js' 2 | import contractions from './contractions.js' 3 | import compute from './compute/index.js' 4 | 5 | 6 | export default { 7 | mutate: (world) => { 8 | world.model.one.unicode = unicode 9 | 10 | world.model.one.contractions = contractions 11 | }, 12 | compute 13 | } -------------------------------------------------------------------------------- /src/02-two/preTagger/compute/2nd-pass/noun-fallback.js: -------------------------------------------------------------------------------- 1 | const nounFallback = function (terms, i, world) { 2 | let setTag = world.methods.one.setTag 3 | let term = terms[i] 4 | if (term.tags.size === 0) { 5 | setTag([term], 'Noun', world, false, 'fallback') 6 | return true 7 | } 8 | return null 9 | } 10 | export default nounFallback 
-------------------------------------------------------------------------------- /src/02-two/tagset/plugin.js: -------------------------------------------------------------------------------- 1 | import nouns from './tags/nouns.js' 2 | import verbs from './tags/verbs.js' 3 | import values from './tags/values.js' 4 | import dates from './tags/dates.js' 5 | import misc from './tags/misc.js' 6 | 7 | let tags = Object.assign({}, nouns, verbs, values, dates, misc) 8 | 9 | export default { 10 | tags 11 | } 12 | -------------------------------------------------------------------------------- /src/02-two/preTagger/model/index.js: -------------------------------------------------------------------------------- 1 | import regexNormal from './regex/regex-normal.js' 2 | import regexNumbers from './regex/regex-numbers.js' 3 | import regexText from './regex/regex-text.js' 4 | import suffixPatterns from './suffixes.js' 5 | 6 | 7 | export default { 8 | regexNormal, 9 | regexNumbers, 10 | regexText, 11 | suffixPatterns 12 | } 13 | -------------------------------------------------------------------------------- /data/lexicon/nouns/possessives.js: -------------------------------------------------------------------------------- 1 | // are these right? 
2 | export default ['en', 'lui', 'nous', 'leur', 'm', 'me', 'vous', 'te', 'toi', 'ce', 3 | 4 | 'mon', 'ma', 'mes',// my 5 | 'ton', 'ta', 'tes',// your 6 | 'son', 'sa', 'ses',// his 7 | 'notre', 'notre', 'nos',// our 8 | 'votre', 'votre', 'vos',// your 9 | 'leur', 'leur', 'leurs',// their 10 | ] 11 | -------------------------------------------------------------------------------- /src/01-one/lexicon/methods/model.js: -------------------------------------------------------------------------------- 1 | import { uncompress } from 'suffix-thumb' 2 | import packed from './_data.js' 3 | 4 | // uncompress them 5 | let model = Object.keys(packed).reduce((h, k) => { 6 | h[k] = {} 7 | Object.keys(packed[k]).forEach(form => { 8 | h[k][form] = uncompress(packed[k][form]) 9 | }) 10 | return h 11 | }, {}) 12 | 13 | export default model -------------------------------------------------------------------------------- /src/02-two/preTagger/plugin.js: -------------------------------------------------------------------------------- 1 | import preTagger from './compute/index.js' 2 | import tagRank from './tagRank.js' 3 | import model from './model/index.js' 4 | import methods from './methods/index.js' 5 | 6 | 7 | export default { 8 | compute: { 9 | preTagger, 10 | tagRank 11 | }, 12 | methods, 13 | model: { 14 | two: model 15 | }, 16 | hooks: ['preTagger'] 17 | } -------------------------------------------------------------------------------- /data/lexicon/dates/weekdays.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'lundi', // - Monday. 3 | 'mardi', // - Tuesday. 4 | 'mercredi', // - Wednesday. 5 | 'jeudi', // - Thursday. 6 | 'vendredi', // - Friday. 7 | 'samedi', // - Saturday. 8 | 'dimanche', // - Sunday. 
9 | 'lun', // 10 | 'mar', // 11 | 'mer', // 12 | 'jeu', // 13 | 'ven', // 14 | 'sam', // 15 | 'dim', // 16 | ] 17 | -------------------------------------------------------------------------------- /plugins/dates/tests/_lib.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-console */ 2 | import build from '../../../builds/fr-compromise.mjs' 3 | import src from '../../../src/index.js' 4 | let nlp = src 5 | if (process.env.TESTENV === 'prod') { 6 | console.warn('== production build test 🚀 ==') 7 | nlp = build 8 | } 9 | 10 | import plg from '../src/plugin.js' 11 | nlp.plugin(plg) 12 | 13 | export default nlp 14 | -------------------------------------------------------------------------------- /plugins/dates/src/toJson.js: -------------------------------------------------------------------------------- 1 | 2 | const toJson = function (arr) { 3 | return arr.map(o => { 4 | let res = { 5 | start: o.start.start().iso() 6 | } 7 | // either explicit or implicit end date 8 | if (o.end) { 9 | res.end = o.end.end().iso() 10 | } else { 11 | res.end = o.start.end().iso() 12 | } 13 | return res 14 | }) 15 | } 16 | export default toJson -------------------------------------------------------------------------------- /src/01-one/tokenize/compute/index.js: -------------------------------------------------------------------------------- 1 | import machine from './machine.js' 2 | 3 | // cheat-method for a quick loop 4 | const termLoop = function (view, fn) { 5 | let docs = view.docs 6 | for (let i = 0; i < docs.length; i += 1) { 7 | for (let t = 0; t < docs[i].length; t += 1) { 8 | fn(docs[i][t], view.world) 9 | } 10 | } 11 | } 12 | export default { 13 | machine: (view) => termLoop(view, machine), 14 | } -------------------------------------------------------------------------------- /learn/adjectives/learn.js: -------------------------------------------------------------------------------- 1 | import data from './data.js' 2 | 
// import data from '../nouns/data.js' 3 | 4 | import { learn, compress, test } from 'suffix-thumb' 5 | 6 | 7 | const pairs = {} 8 | data.forEach(a => { 9 | let [m, f, mp, fp] = a 10 | pairs[m] = [f, mp, fp] 11 | }) 12 | 13 | console.log(JSON.stringify(pairs, null, 2)) 14 | // let model = learn(pairs) 15 | // model = compress(model) 16 | // console.log(JSON.stringify(model, null, 2)) 17 | // test(pairs) -------------------------------------------------------------------------------- /plugins/dates/src/phrase/normalize.js: -------------------------------------------------------------------------------- 1 | const normalize = function (m) { 2 | m = m.clone() 3 | // remove redundant day-names like 'Wed march 2nd' 4 | if (m.has('#WeekDay') && m.has('#Month') && m.has('#NumericValue')) { 5 | m.remove('#WeekDay') 6 | } 7 | // jusqu'à le quatorze juillet 8 | m.remove('(le|la)') 9 | // quatorze -> 14 10 | m.numbers().toCardinal().toNumber() 11 | // m.compute('index') 12 | return m 13 | } 14 | export default normalize -------------------------------------------------------------------------------- /data/lexicon/numbers/ordinals.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'zeroième', 3 | 'premier', 4 | 'unième', 5 | 'deuxième', 6 | 'troisième', 7 | 'quatrième', 8 | 'cinquième', 9 | 'sixième', 10 | 'septième', 11 | 'huitième', 12 | 'neuvième', 13 | 'dixième', 14 | 'onzième', 15 | 'douzième', 16 | 'treizième', 17 | 'quatorzième', 18 | 'quinzième', 19 | 'seizième', 20 | 'vingtième', 21 | 'trentième', 22 | 'quarantième', 23 | 'cinquantième', 24 | 'soixantième', 25 | ] 26 | -------------------------------------------------------------------------------- /src/03-three/numbers/find.js: -------------------------------------------------------------------------------- 1 | const findNumbers = function (view) { 2 | let m = view.match('#Value+') 3 | 4 | //seventh fifth 5 | if (m.match('#Ordinal 
#Ordinal').match('#TextValue').found && !m.has('#Multiple')) { 6 | m = m.splitAfter('#Ordinal') 7 | } 8 | 9 | //fifth five 10 | m = m.splitBefore('#Ordinal [#Cardinal]', 0) 11 | //5-8 12 | m = m.splitAfter('#NumberRange') 13 | // june 5th 1999 14 | m = m.splitBefore('#Year') 15 | return m 16 | } 17 | export default findNumbers -------------------------------------------------------------------------------- /src/01-one/lexicon/methods/noun/index.js: -------------------------------------------------------------------------------- 1 | import { convert, reverse } from 'suffix-thumb' 2 | import model from '../model.js' 3 | 4 | let pRev = reverse(model.noun.plural) 5 | const toPlural = (str) => convert(str, model.noun.plural) 6 | const fromPlural = (str) => convert(str, pRev) 7 | 8 | const all = (str) => { 9 | let plr = toPlural(str) 10 | if (str === plr) { 11 | return [str] 12 | } 13 | return [str, plr] 14 | } 15 | export default { 16 | toPlural, 17 | fromPlural, 18 | all 19 | } -------------------------------------------------------------------------------- /plugins/dates/README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | travaux en cours! • work-in-progress! 5 | 6 |
7 | 8 | ```js 9 | import nlp from 'fr-compromise' 10 | import frDatePlugin from 'fr-compromise-dates' 11 | nlp.plugin(frDatePlugin) 12 | 13 | let doc = nlp('entre sept et oct') 14 | doc.dates().json()[0] 15 | /* 16 | { text: 'entre sept et oct', 17 | date: [{ 18 | start: { month: 9, year: 2023 }, 19 | end: { month: 10, year: 2023 } 20 | }] 21 | }*/ 22 | ``` 23 | 24 | MIT -------------------------------------------------------------------------------- /learn/wiktionary/add.js: -------------------------------------------------------------------------------- 1 | import fixes from './fixes.js' 2 | import adj from '../../data/models/adjective/index.js' 3 | 4 | let data = adj 5 | //m: [f, p, fp] 6 | let out = {} 7 | Object.keys(fixes).forEach(k => { 8 | let arr = fixes[k] 9 | if (arr.length === 1) { 10 | // only got a plural 11 | out[k] = [k, arr[0], arr[0]] 12 | } else if (arr.length === 3) { 13 | // only fem plurals 14 | let [m, f, fp] = arr 15 | out[k] = [f, m, fp] 16 | } 17 | }) 18 | data = Object.assign(data, out) 19 | console.log(JSON.stringify(data, null, 2)) -------------------------------------------------------------------------------- /src/02-two/preTagger/model/regex/regex-text.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | // #coolguy 3 | [/^#[a-z0-9_\u00C0-\u00FF]{2,}$/i, 'HashTag'], 4 | 5 | // @spencermountain 6 | [/^@\w{2,}$/, 'AtMention'], 7 | 8 | // period-ones acronyms - f.b.i. 
9 | [/^([A-Z]\.){2}[A-Z]?/i, ['Acronym', 'Noun'], 'F.B.I'], //ascii-only 10 | 11 | // ending-apostrophes 12 | [/.{3}[lkmnp]in['‘’‛‵′`´]$/, 'Gerund', "chillin'"], 13 | [/.{4}s['‘’‛‵′`´]$/, 'Possessive', "flanders'"], 14 | 15 | // leading contractions 16 | // [/^s'[a-z]$/, 'Verb'], 17 | // [/^l'[a-z]$/, 'Noun'], 18 | ] 19 | -------------------------------------------------------------------------------- /data/lexicon/numbers/cardinals.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'zero', // - 0 3 | 'un', // - 1 4 | 'deux', // - 2 5 | 'trois', // - 3 6 | 'quatre', // - 4 7 | 'cinq', // - 5 8 | 'six', // - 6 9 | 'sept', // - 7 10 | 'huit', // - 8 11 | 'neuf', // - 9 12 | 13 | 'dix', 14 | 'onze', 15 | 'douze', 16 | 'treize', 17 | 'quatorze', 18 | 'quinze', 19 | 'seize', 20 | 'dix sept', 21 | 'dix huit', 22 | 'dix neuf', 23 | 'vingt', 24 | 'trente', 25 | 'quarante', 26 | 'cinquante', 27 | 'soixante', 28 | // 'quatre vingt', 29 | // 'quatre vingt dix huit', 30 | 31 | ] 32 | -------------------------------------------------------------------------------- /plugins/dates/src/find.js: -------------------------------------------------------------------------------- 1 | const findDates = function (doc) { 2 | let m = doc.match('#Date+') 3 | // 7 jun 2018 4 | m = m.growLeft('#Value+$') 5 | m = m.growRight('^#Value+') 6 | // pendant juin 7 | m = m.growLeft('(le|la)$')// jusqu'a le 8 | m = m.growLeft('(en|entre|depuis|courant|pendant|dans|lorsque|avant|après|à|a|au)$') 9 | m = m.growLeft('au cours de$') 10 | m = m.growLeft('jusque$')// jusqu'en jusqu'à 11 | // sept-et-jun 12 | m = m.growRight('^et (le|la)? 
#Date+') 13 | 14 | // remove overlaps 15 | m = m.settle() 16 | // m.debug() 17 | return m 18 | } 19 | export default findDates -------------------------------------------------------------------------------- /src/02-two/preTagger/compute/3rd-pass/fix-contractions.js: -------------------------------------------------------------------------------- 1 | // better guesses for 'le/la/les' in l'foo 2 | const fixContractions = function (terms, i) { 3 | let term = terms[i] 4 | // let tags = term.tags 5 | if (term.implicit === 'le') { 6 | let nextTerm = terms[i + 1] 7 | if (!nextTerm) { 8 | return null 9 | } 10 | if (nextTerm.tags.has('FemaleNoun')) { 11 | term.implicit = 'la' 12 | } 13 | // support female plural? 14 | if (nextTerm.tags.has('PluralNoun')) { 15 | term.implicit = 'les' 16 | } 17 | } 18 | return null 19 | } 20 | export default fixContractions -------------------------------------------------------------------------------- /add-verbs.js: -------------------------------------------------------------------------------- 1 | import prettyJSON from 'pretty-json-stringify' 2 | 3 | import fs from 'fs' 4 | // parse JSON-newline file 5 | let arr = fs.readFileSync('./more-verbs.jsonl').toString() 6 | .split(/\n/).filter(str => str).map(str => JSON.parse(str)) 7 | 8 | let out = {} 9 | arr.forEach(obj => { 10 | if (obj['Indicatif Futur'][0]) { 11 | let str = obj['Indicatif Futur'] 12 | out[obj.word] = str 13 | } 14 | }) 15 | console.log(prettyJSON(out, { 16 | shouldExpand: (_, level) => level >= 1 ? 
false : true 17 | })) 18 | 19 | import nlp from './src/index.js' 20 | // console.log(nlp('dépister').verbs().conjugate()) 21 | 22 | -------------------------------------------------------------------------------- /data/lexicon/people/firstnames.js: -------------------------------------------------------------------------------- 1 | //ambiguously-gendered firstnames 2 | //names commonly used in either gender 3 | export default [ 4 | 'alexis', 5 | 'andra', 6 | 'aubrey', 7 | 'blair', 8 | 'casey', 9 | 'cassidy', 10 | 'cheyenne', 11 | 'devan', 12 | 'devon', 13 | 'jamie', 14 | 'jammie', 15 | 'jessie', 16 | 'jude', 17 | 'kasey', 18 | 'kelsey', 19 | 'kenyatta', 20 | 'kerry', 21 | 'kris', 22 | 'lashawn', 23 | 'marion', 24 | 'marlo', 25 | 'mel', 26 | 'morgan', 27 | 'nelly', 28 | 'quinn', 29 | 'regan', 30 | 'rene', 31 | 'shay', 32 | 'shea', 33 | 'shelby', 34 | 'shiloh', 35 | ] 36 | -------------------------------------------------------------------------------- /src/02-two/preTagger/compute/1st-pass/titlecase.js: -------------------------------------------------------------------------------- 1 | const isTitleCase = function (str) { 2 | return /^[A-Z][a-z'\u00C0-\u00FF]/.test(str) || /^[A-Z]$/.test(str) 3 | } 4 | 5 | // add a noun to any non-0 index titlecased word, with no existing tag 6 | const titleCaseNoun = function (terms, i, world) { 7 | let setTag = world.methods.one.setTag 8 | let term = terms[i] 9 | if (i === 0) { 10 | return null 11 | } 12 | if (term.tags.size > 0) { 13 | return null 14 | } 15 | if (isTitleCase(term.text)) { 16 | setTag([term], 'ProperNoun', world, false, 'title-case') 17 | return true 18 | } 19 | return null 20 | } 21 | export default titleCaseNoun -------------------------------------------------------------------------------- /tests/buildNet.test.js: -------------------------------------------------------------------------------- 1 | import test from 'tape' 2 | import nlp from './_lib.js' 3 | let here = '[fr-buildNet] ' 4 | 5 | test('buildNet:', 
function (t) { 6 | let matches = [ 7 | { match: '{crier/Verb}' }, 8 | { match: '{jaune/Adjective}' }, 9 | { match: '{troupe/Noun}' } 10 | ] 11 | let net = nlp.buildNet(matches) 12 | t.ok(net.hooks.crier, here + 'crier') 13 | t.ok(net.hooks.criaient, here + 'criaient') 14 | t.ok(net.hooks.criaient, here + 'criaient') 15 | t.ok(net.hooks.jaune, here + 'jaune') 16 | t.ok(net.hooks.jaunes, here + 'jaunes') 17 | t.ok(net.hooks.troupe, here + 'troupe') 18 | t.ok(net.hooks.troupes, here + 'troupes') 19 | t.end() 20 | }) -------------------------------------------------------------------------------- /scripts/stress.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-console, no-unused-vars */ 2 | import corpus from 'fr-corpus' //install with `npm i fr-corpus --no-save` 3 | import nlp from '../src/index.js' 4 | let texts = corpus.all() 5 | console.log(`\n\n--- running compromise on ${texts.length.toLocaleString()} random sentences---\n`) 6 | console.log(' --should take a few minutes--') 7 | 8 | for (let i = 0; i < texts.length; i++) { 9 | let txt = texts[i][0] 10 | let doc = nlp(txt) 11 | let m = doc.match('#Determiner #Adverb #Adjective #Noun') 12 | m.forEach(d => { 13 | d.terms() 14 | }) 15 | m.verbs().conjugate() 16 | doc.numbers().add(2) 17 | } 18 | 19 | console.log('\n\n - done!') 20 | -------------------------------------------------------------------------------- /data/lexicon/dates/months.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'janvier', // - January 3 | 'février', // - February 4 | 'mars', // - March 5 | 'avril', // - April 6 | 'mai', // - May 7 | 'juin', // - June 8 | 'juillet', // - July 9 | 'aout', // - August 10 | 'septembre', // -September 11 | 'octobre', // - October 12 | 'novembre', // - November 13 | 'décembre', // - December 14 | 'fevrier', 15 | 'decembre', 16 | 17 | 'janv', 18 | 'jan', 19 | 'fév', 20 | 'fev', 21 | 'févr', 22 | 
// Tag numeric cardinals that look like a calendar year.
// A term qualifies when it is tagged both 'Cardinal' and 'NumericValue' and
// its text parses to a whole number strictly between 1600 and 2090.
//  - terms: list of term objects; i: index of the term to inspect
//  - world: must provide `methods.one.setTag`
// returns the result of setTag when the 'Year' tag is applied, otherwise null
const numberTags = function (terms, i, world) {
  const { setTag } = world.methods.one
  const term = terms[i]
  const { tags } = term
  const isNumeric = tags.has('Cardinal') && tags.has('NumericValue')
  if (!isNumeric) {
    return null
  }
  const n = Number(term.text)
  // NaN and fractional values are rejected by the integer check
  const yearLike = Number.isInteger(n) && n > 1600 && n < 2090
  if (!yearLike) {
    return null
  }
  return setTag([term], 'Year', world, false, '3-year')
}
export default numberTags
// Sort the adverbs of a verb-phrase into those found before the root verb
// and those found after it.
//  - vb:   the verb-phrase view
//  - root: the main verb inside `vb`, used as the pivot
// returns { pre, post }; a side with no adverbs holds `vb.none()`
const getAdverbs = function (vb, root) {
  const found = { pre: vb.none(), post: vb.none() }
  // nothing to sort when the phrase has no adverb at all
  if (!vb.has('#Adverb')) {
    return found
  }
  // cut the phrase around the main verb
  const chunks = vb.splitOn(root)
  // [before, root, after]
  if (chunks.length === 3) {
    const pre = chunks.eq(0).adverbs()
    const post = chunks.eq(2).adverbs()
    return { pre, post }
  }
  // otherwise the root is either the first chunk or the second one
  const rootComesFirst = chunks.eq(0).isDoc(root)
  if (rootComesFirst) {
    found.post = chunks.eq(1).adverbs()
  } else {
    found.pre = chunks.eq(0).adverbs()
  }
  return found
}
export default getAdverbs
obj[arr[i]] = true 9 | // } 10 | // return Object.keys(obj) 11 | // } 12 | 13 | // console.log(JSON.stringify(unique(messy), null, 2)) 14 | 15 | 16 | let loose = og.filter(str => { 17 | let found = keep.find(s => s === str) 18 | if (found) { 19 | console.log(str) 20 | return false 21 | } 22 | return true 23 | }) 24 | 25 | console.log(og.length) 26 | console.log(loose.length) 27 | // console.log(JSON.stringify(loose, null, 2)) -------------------------------------------------------------------------------- /src/02-two/tagset/tags/values.js: -------------------------------------------------------------------------------- 1 | export default { 2 | Value: { 3 | not: ['Verb', 'Adjective', 'Adverb'], 4 | }, 5 | Ordinal: { 6 | is: 'Value', 7 | not: ['Cardinal'], 8 | }, 9 | Cardinal: { 10 | is: 'Value', 11 | not: ['Ordinal'], 12 | }, 13 | Fraction: { 14 | is: 'Value', 15 | not: ['Noun'], 16 | }, 17 | Multiple: { 18 | is: 'TextValue', 19 | }, 20 | RomanNumeral: { 21 | is: 'Cardinal', 22 | not: ['TextValue'], 23 | }, 24 | TextValue: { 25 | is: 'Value', 26 | not: ['NumericValue'], 27 | }, 28 | NumericValue: { 29 | is: 'Value', 30 | not: ['TextValue'], 31 | }, 32 | Money: { 33 | is: 'Cardinal', 34 | }, 35 | Percent: { 36 | is: 'Value', 37 | }, 38 | } 39 | -------------------------------------------------------------------------------- /learn/wikinews/packSuffixes.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const suff = require('../../src/tagger/data/suffixMap.js') 3 | 4 | // find any long suffixes that are covered by shorter ones 5 | const twos = suff[5] 6 | const twoWords = Object.keys(twos) 7 | let count = 0 8 | 9 | for (let i = 6; i <= 6; i += 1) { 10 | twoWords.forEach((ending) => { 11 | let testWords = Object.keys(suff[i]) 12 | testWords.forEach((w) => { 13 | if (w.endsWith(ending)) { 14 | if (twos[ending] === suff[i][w]) { 15 | count += 1 16 | console.log('kill:', w, `(${ending})`) 17 | delete 
// Assign a gender tag to a noun that does not have one yet.
//  - terms: list of term objects; i: index of the term to inspect
//  - world: must provide `methods.one.setTag` and `methods.one.guessGender`
// returns the result of setTag when a gender was found, otherwise null
const nounGender = function (terms, i, world) {
  const { setTag, guessGender } = world.methods.one
  const term = terms[i]
  const { tags } = term
  // only nouns that have no gender yet
  const alreadyGendered = tags.has('MaleNoun') || tags.has('FemaleNoun')
  if (!tags.has('Noun') || alreadyGendered) {
    return null
  }
  // these kinds of noun are left without a gender
  const skipped = ['ProperNoun', 'Pronoun', 'Possessive'].some(tag => tags.has(tag))
  if (skipped) {
    return null
  }
  // delegate the actual guess (e.g. a preceding 'le', or the suffix)
  const gender = guessGender(terms, i)
  if (!gender) {
    return null
  }
  return setTag([term], gender, world, false, '3-noun-gender')
}
export default nounGender
const hasDash = /^\p{Letter}+-\p{Letter}+$/u

// Compute the 'machine' form of a term: a normalized spelling that gives up
// human-readability. The starting point is the first available of
// `term.implicit`, `term.normal`, `term.text`. `term.machine` is only written
// when the result differs from `term.normal`.
// (accented characters are not stripped here)
const doMachine = function (term) {
  const start = term.implicit || term.normal || term.text
  let out = start
    // drop apostrophe-s, and a trailing apostrophe after s
    .replace(/['’]s$/, '')
    .replace(/s['’]$/, 's')
    // lookin' -> looking (make it easier for conjugation)
    .replace(/([aeiou][ktrp])in'$/, '$1ing')
  // re-enactment -> reenactment (single-dash words only)
  if (hasDash.test(out)) {
    out = out.replace(/-/g, '')
  }
  // leading marker of #tags and @mentions
  out = out.replace(/^[#@]/, '')
  if (out === term.normal) {
    return
  }
  term.machine = out
}
export default doMachine
// Rank the distinct values of `arr` from most frequent to least frequent.
// Values are tallied by their string form (they are used as object keys), so
// the returned array holds those string keys. Values with the same count keep
// the order in which the keys are enumerated.
const topk = function (arr) {
  // prototype-less tally, so that words like 'constructor' start from zero
  const counts = Object.create(null)
  arr.forEach(a => {
    counts[a] = (counts[a] || 0) + 1
  })
  return Object.entries(counts)
    // a proper three-way comparator: highest count first
    .sort((a, b) => b[1] - a[1])
    .map(pair => pair[0])
}
// Regex-based tagging rules. Each entry is [pattern, tag, label]; the label
// (when present) is a short example or name for the rule.
// NOTE(review): going by the file name these presumably run against each
// term's normalized text — confirm against the regex pass that consumes them.
const regexNormal = [
  //web tags
  [/^[\w.]+@[\w.]+\.[a-z]{2,3}$/, 'Email'],
  [/^(https?:\/\/|www\.)+\w+\.[a-z]{2,3}/, 'Url', 'http..'],
  [/^[a-z0-9./].+\.(com|net|gov|org|ly|edu|info|biz|dev|ru|jp|de|in|uk|br|io|ai)/, 'Url', '.com'],

  // timezones
  [/^[PMCE]ST$/, 'Timezone', 'EST'],

  //names
  [/^ma?c'.*/, 'LastName', "mc'neil"],
  [/^o'[drlkn].*/, 'LastName', "o'connor"],
  [/^ma?cd[aeiou]/, 'LastName', 'mcdonald'],

  //slang things
  [/^(lol)+[sz]$/, 'Expression', 'lol'],
  [/^wo{2,}a*h?$/, 'Expression', 'wooah'],
  [/^(hee?){2,}h?$/, 'Expression', 'hehe'],
  // prefix + hyphen + word. The hyphen is matched literally; the previous
  // `\\-` required a backslash before the hyphen and so never matched.
  [/^(un|de|re)-[a-z\u00C0-\u00FF]{2}/, 'Verb', 'un-vite'],

  // m/h
  [/^(m|k|cm|km)\/(s|h|hr)$/, 'Unit', '5 k/m'],
  // μg/g
  [/^(ug|ng|mg)\/(l|m3|ft3)$/, 'Unit', 'ug/L'],
]
export default regexNormal
//extend to person-names if infront of a name - 'Professor Frink'
// Each title is listed once. 'duchess' is the correct spelling; the older
// misspelling 'dutchess' is kept so existing matches are unaffected.
const honorifics = [
  'admiral',
  'ayatullah',
  'brigadier',
  'captain',
  'chancellor',
  'colonel',
  'commander',
  'congressman',
  'congresswoman',
  'councillor',
  'count',
  'doctor',
  'duchess',
  'dutchess',
  'excellency',
  'field marshal',
  'first lady',
  'first lieutenant',
  'judge',
  'king',
  'lieutenant',
  'magistrate',
  'marshal',
  'mayor',
  'officer',
  'pastor',
  'president',
  'prime minister',
  'prince',
  'princess',
  'professor',
  'queen',
  'rabbi',
  'rear admiral',
  'reverend',
  'second lieutenant',
  'secretary',
  'sergeant',
  'sultan',
  'taoiseach',
  'vice admiral',
]
export default honorifics
// tags that are always pushed to the back of the ranking
const boringTags = new Set(['Auxiliary', 'Possessive'])

// number of children a known tag has in the tagset
const countKids = (tagSet, tag) => (tagSet[tag].children || []).length

// Sort `tags` in place (and return it): tags missing from the tagset come
// first, boring tags come last, and the rest are ordered by ascending number
// of children.
const sortByKids = function (tags, tagSet) {
  const byKids = (a, b) => {
    // (unknown tags are interesting)
    if (boringTags.has(a) || !tagSet.hasOwnProperty(b)) {
      return 1
    }
    if (boringTags.has(b) || !tagSet.hasOwnProperty(a)) {
      return -1
    }
    return countKids(tagSet, a) - countKids(tagSet, b)
  }
  tags.sort(byKids)
  return tags
}

// Store a ranked array of its tags on every term of the view, as
// `term.tagRank`, using the tagset found at `world.model.one.tagSet`.
const tagRank = function (view) {
  const { document, world } = view
  const { tagSet } = world.model.one
  const rankTerm = term => {
    term.tagRank = sortByKids(Array.from(term.tags), tagSet)
  }
  document.forEach(terms => terms.forEach(rankTerm))
}
export default tagRank
-------------------------------------------------------------------------------- 1 | ### 0.2.8 [Aug 2023] 2 | 3 | - **[fix]** - conjugation issues 4 | - **[update]** - dependencies 5 | 6 | ### 0.2.7 [May 2023] 7 | 8 | - **[fix]** - tagging 9 | - **[new]** - `fr-compromise-dates` 10 | 11 | ### 0.2.6 [Feb 2023] 12 | 13 | - **[fix]** - support multi-lexicon 14 | - **[fix]** - try new suffix thumb 15 | - **[fix]** - conjugation fixes 16 | 17 | ### 0.2.0 [Sept 2022] 18 | 19 | - **[fix]** - inflections+conjugations 20 | - **[new]** - start of verb, noun, and adjective methods 21 | 22 | ### 0.1.2 [August 2022] 23 | 24 | - **[fix]** - inflections+conjugations 25 | 26 | ### 0.1.1 [July 2022] 27 | 28 | - **[fix]** - import format 29 | - **[new]** - typescript types 30 | 31 | ### 0.1.0 [June 2022] 32 | 33 | - **[new]** - `.compute('root')` 34 | - **[new]** - number-parsing 35 | 36 | ### 0.0.2 [June 2022] 37 | 38 | - **[new]** - support root matches 39 | - **[new]** - `.compute('root')` 40 | - **[new]** - FirstPerson, SecondPerson tags etc.
// French contraction table.
// Each entry lists the words a contraction expands to (`out`), keyed either by
// `word` or by `before`.
// NOTE(review): presumably `word` matches a whole token and `before` matches
// the part in front of the apostrophe — confirm against the code that consumes
// this list.
export default [
  { word: "qu'il", out: ['que', 'il'] },
  // NOTE(review): "n'y" expands to ['ne', 'a'] here; 'ne' + 'y' looks like the
  // expected expansion — confirm whether 'a' is intentional.
  { word: "n'y", out: ['ne', 'a'] },
  { word: "n'est", out: ['ne', 'est'] },
  { word: 'aux', out: ['à', 'les'] },
  { word: 'au', out: ['à', 'le'] },
  { before: 'm', out: ['me'] },
  { before: 's', out: ['se'] },
  // NOTE(review): t' is expanded to 'tu' (never 'te') — confirm this is the intended choice.
  { before: 't', out: ['tu'] },
  { before: 'n', out: ['ne'] },
  { before: 'qu', out: ['que'] },//tant qu'étudiant
  { before: 'puisqu', out: ['puisque'] },
  { before: 'lorsqu', out: ['lorsque'] },//lorsqu’il
  { before: 'jusqu', out: ['jusque'] },//jusqu'en
  { before: 'quelqu', out: ['quelque'] },//Quelqu'un

  // preposition (à / de) fused with a 'lequel' form
  { word: 'auquel', out: ['à', 'lequel'] },
  { word: 'auxquels', out: ['à', 'lesquels'] },
  { word: 'auxquelles', out: ['à', 'lesquelles'] },
  { word: 'duquel', out: ['de', 'lequel'] },
  { word: 'desquels', out: ['de', 'lesquels'] },
  { word: 'desquelles', out: ['de', 'lesquelles'] },
]
// singular nouns that nevertheless end in 's'
const exceptions = new Set([
  'bras',
  'bus',
  'corps',
  'discours',
  'fils',
  'héros',
  'os',
  'pays',
  'procès',
  'poids',
  'repas',
  'sens',
  'succès',
])

// noun kinds that never receive a plural/singular guess
const skippedKinds = ['Pronoun', 'ProperNoun', 'Uncountable', 'Date']

// Guess a plural/singular tag for a noun from its spelling.
//  - terms: list of term objects; i: index of the term to inspect
//  - world: must provide `methods.one.setTag`
// returns the result of setTag when a tag is applied, otherwise null
const nounPlurals = function (terms, i, world) {
  const { setTag } = world.methods.one
  const term = terms[i]
  const { tags } = term
  const word = term.implicit || term.normal || term.text || ''
  if (!tags.has('Noun')) {
    return null
  }
  if (skippedKinds.some(kind => tags.has(kind))) {
    return null
  }
  // known singular words win over the suffix rule
  if (exceptions.has(word)) {
    return setTag([term], 'Singular', world, false, '3-plural-guess')
  }
  // a final 's' means plural, except for words ending in 'is'
  const looksPlural = word.endsWith('s') && !word.endsWith('is')
  return looksPlural ? setTag([term], 'PluralNoun', world, false, '3-plural-guess') : null
}
export default nounPlurals
13 | 14 | 'de', 'du', 'des', 15 | 'a', 16 | 'd', 17 | 'en', 18 | 'dans', 19 | 'pour', 20 | 'par', 21 | 'sur', 22 | 'avec', 23 | 'apres', 24 | 'selon', 25 | 'depuis', 26 | 'contre', 27 | 'entre', 28 | 'comme', 29 | 'avant', 30 | 'sans', 31 | 'devant', 32 | 'sous', 33 | 'vers', 34 | 'pendant', 35 | 'afin', 36 | 'des', 37 | 'durant', 38 | 'parmi', 39 | 'pres', 40 | 'malgre', 41 | 'chez', 42 | 'aupres', 43 | "jusqu'", 44 | 'concernant', 45 | 'a', 46 | 'à', 47 | 'derriere', 48 | 'hors', 49 | 'outre', 50 | 'envers', 51 | 'sauf', 52 | 'via', 53 | 'jusque', 54 | 'suivant', 55 | 'hormis', 56 | 'environ', 57 | 'par dessus', 58 | 'excepte', 59 | "quelqu'", 60 | 'because', 61 | 'grace', 62 | 'courant', 63 | 'au dessus', 64 | 'voici', 65 | ] 66 | -------------------------------------------------------------------------------- /learn/verbs/old.js: -------------------------------------------------------------------------------- 1 | import verbs from './data.js' 2 | import { learn, test, validate, compress } from 'suffix-thumb' 3 | const hasPipe = /[\|\[]/ 4 | 5 | let index = { 6 | 'je': 0, // "achète", 7 | 'tu': 1, // "achètes", 8 | 'il': 2, // "achète", 9 | 'nous': 3, // "achetons", 10 | 'vous': 4, // "achetez", 11 | 'ils': 5, // "achètent" 12 | } 13 | 14 | const doModel = function (tense, form) { 15 | let pairs = [] 16 | const i = index[form] 17 | Object.keys(verbs).forEach(inf => { 18 | let want = verbs[inf][tense][i] 19 | if (want && !hasPipe.test(want)) { 20 | pairs.push([inf, want]) 21 | } 22 | }) 23 | pairs = validate(pairs) 24 | // test(pairs) 25 | const model = learn(pairs) 26 | return model 27 | } 28 | 29 | 30 | let tense = "Présent" 31 | const models = { 32 | je: doModel(tense, 'je'), 33 | tu: doModel(tense, 'tu'), 34 | il: doModel(tense, 'il'), 35 | nous: doModel(tense, 'nous'), 36 | vous: doModel(tense, 'vous'), 37 | ils: doModel(tense, 'ils'), 38 | } 39 | 40 | // let model = doModel("Présent", 'je') 41 | // model = compress(model) 42 | 
console.log(JSON.stringify(models, null, 2)) 43 | -------------------------------------------------------------------------------- /data/lexicon/numbers/units.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | '°c', 3 | 'celsius', 4 | '°f', 5 | 'fahrenheit', 6 | 'fahrenheits', 7 | 'kelvin', 8 | 'kelvins', 9 | '°n', 10 | 'm³', 11 | 12 | 'hertz', 13 | 'km/h', 14 | 'byte', 15 | 'bytes', 16 | // 'kb', 17 | 'kilobyte', 18 | 'kilobytes', 19 | // 'mb', 20 | 'megabyte', 21 | 'megabytes', 22 | // 'gb', 23 | 'gigabyte', 24 | 'gigabytes', 25 | // 'tb', 26 | 'terabyte', 27 | 'terabytes', 28 | 'petabyte', 29 | 'petabytes', 30 | 'eb', 31 | 'exabyte', 32 | 'exabytes', 33 | 'zb', 34 | 'zettabyte', 35 | 'zettabytes', 36 | 'yb', 37 | 'yottabyte', 38 | 'yottabytes', 39 | 'joule', 40 | 'joules', 41 | 42 | 'µs', 43 | 44 | 'percent', 45 | 46 | 47 | 'gramme', 48 | 'grammes', 49 | 'kilogramme', 50 | 'kilogrammes', 51 | 'kilo', 52 | 'kilos', 53 | 'litre', 54 | 'litres', 55 | 'millilitre', 56 | 'millilitres', 57 | 'centimètre', 58 | 'centimètres', 59 | 'mètre', 60 | 'mètres', 61 | 'kilomètre', 62 | 'km', 63 | 'kms', 64 | // pied 65 | 'pouce', 66 | 'pouces', 67 | 'mile', 68 | 'miles' 69 | // livre 70 | ] 71 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Spencer Kelly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice 
// French month names and abbreviations -> month number (1-12).
// Both accented and unaccented spellings are listed, so a lookup works
// whichever form the caller passes in.
const months = {
  'janvier': 1, // January
  'février': 2, // February
  'fevrier': 2, // February
  'mars': 3, // March
  'avril': 4, // April
  'mai': 5, // May
  'juin': 6, // June
  'juillet': 7, // July
  'août': 8, // August
  'aout': 8, // August
  'septembre': 9, //September
  'octobre': 10, // October
  'novembre': 11, // November
  'décembre': 12, // December
  'decembre': 12, // December
  // abbreviations
  'jan': 1,
  'fév': 2,
  'fev': 2,
  'mar': 3, // note: 'mar' also appears in `days` (mardi)
  'avr': 4,
  'aoû': 8,
  'aou': 8,
  'sep': 9,
  'sept': 9, //hmm
  'oct': 10,
  'nov': 11,
  'déc': 12,
  'dec': 12,
  'janv': 1,
  'févr': 2,
  'fevr': 2,
  'juil': 7,
  'juill': 7,
}

// French weekday names and abbreviations -> weekday number (Sunday is 0).
const days = {
  'lundi': 1, // Monday
  'mardi': 2, // Tuesday
  'mercredi': 3, // Wednesday
  'jeudi': 4, // Thursday
  'vendredi': 5, // Friday
  'samedi': 6, // Saturday
  'dimanche': 0, // Sunday
  'lun': 1,
  'mar': 2,
  'mer': 3,
  'jeu': 4,
  'ven': 5,
  'sam': 6,
  'dim': 0,
}
export { months, days }
// words that are followed by a verb (modal forms, plus 'pas')
const beforeVerb = [
  // modals
  'dois', 'doit', 'devons', 'devez', 'doivent',
  'peux', 'peut', 'pouvons', 'pouvez', 'peuvent',
  // (conditional)
  'pouvait', 'pourrait', 'pourrais', 'pourrions', 'pourriez', 'pourraient',
]

// previous word -> tag to give the (untagged) word that follows it
const hasBefore = {
  la: 'FemaleNoun',
  une: 'FemaleNoun',
  un: 'MaleNoun',
  du: 'MaleNoun',
  au: 'MaleNoun',
  des: 'PluralNoun',
  aux: 'PluralNoun',
  de: 'Noun',
  ...Object.fromEntries(beforeVerb.map(word => [word, 'Verb'])),
  avoir: 'Noun',
  pas: 'Verb' //maybe
}

// Tag an untagged term from the word directly before it.
// Returns true when a tag was set, otherwise null.
const tagNeighbours = function (terms, i, world) {
  const { setTag } = world.methods.one
  const before = terms[i - 1]
  if (!before) {
    return null
  }
  const term = terms[i]
  const prevWord = before.normal
  // only guess for terms that have no tags yet
  const canGuess = term.tags.size === 0 && hasBefore.hasOwnProperty(prevWord)
  if (!canGuess) {
    return null
  }
  setTag([term], hasBefore[prevWord], world, false, 'neighbour')
  return true
}
export default tagNeighbours
// Plugin entry: adds a .dates() method to View, returning a Dates sub-view.
const api = function (View) {
  class Dates extends View {
    // opts are the options given to .dates(), kept for later parsing
    constructor(document, pointer, groups, opts = {}) {
      super(document, pointer, groups)
      this.viewType = 'Dates'
      this.opts = opts || {}
    }
    // parse each match (or only the nth one) into its date json
    parse(n) {
      // pass the match and the stored options as two arguments
      return getNth(this, n).map(m => toJson(parse(m, this.opts)))
    }
    // regular .json() output, with a 'dates' property added to each result
    json(opts, n) {
      let m = getNth(this, n)
      let arr = m.map(vb => {
        let out = vb.toView().json(opts)[0] || {}
        let res = parse(vb, this.opts)
        out.dates = toJson(res)
        return out
      }, [])
      return arr
    }
  }

  // find date phrases in this view.
  // opts.today and opts.timezone are handed to spacetime() for the reference date
  View.prototype.dates = function (opts = {}) {
    // work on a copy, so the caller's options object is not modified
    const settings = Object.assign({}, opts, {
      today: spacetime(opts.today, opts.timezone),
    })
    let m = find(this, settings)
    return new Dates(this.document, m.pointer, null, settings)
  }
}
export default api
// produce all four forms of an adjective, keeping the input as the 'male' form
const conjugate = (str) => ({
  male: str,
  female: toFemale(str),
  plural: toPlural(str),
  femalePlural: toFemalePlural(str),
})

// list the forms of an adjective, leaving out any empty results
const all = function (str) {
  const forms = conjugate(str)
  return Object.keys(forms)
    .map(key => forms[key])
    .filter(Boolean)
}
-------------------------------------------------------------------------------- 1 | export default [ 2 | 'a la', 3 | 'ah', 4 | 'ahem', 5 | 'argh', 6 | 'bah', 7 | 'boo', 8 | 'bye', 9 | 'dammit', 10 | 'damn', 11 | 'damnit', 12 | 'dang', 13 | 'duh', 14 | 'eek', 15 | 'eep', 16 | 'eh', 17 | 'et cetera', 18 | 'eww', 19 | 'fuck', 20 | 'gah', 21 | 'gee', 22 | 'golly', 23 | 'goodbye', 24 | 'grr', 25 | 'haha', 26 | 'hahaha', 27 | 'hai', 28 | 'hee', 29 | 'hell', 30 | 'hello', 31 | 'hey', 32 | 'hi', 33 | 'hmm', 34 | 'holy moly', 35 | 'holy', 36 | 'hurrah', 37 | 'lmao', 38 | 'lmfao', 39 | 'lol', 40 | 'lols', 41 | 'meh', 42 | 'mmm', 43 | 'nah', 44 | 'nope', 45 | 'oh', 46 | 'ohh', 47 | 'ooh', 48 | 'ooo', 49 | 'oops', 50 | 'ow', 51 | 'oy', 52 | 'pff', 53 | 'phew', 54 | 'please', 55 | 'plz', 56 | 'psst', 57 | 'sheesh', 58 | 'shhh', 59 | 'shit', 60 | 'tsk', 61 | 'ugh', 62 | 'uh huh', 63 | 'uh oh', 64 | 'uh', 65 | 'uhh', 66 | 'uhm', 67 | 'voila', 68 | 'whee', 69 | 'whew', 70 | 'whoa', 71 | 'wow', 72 | 'wtaf', 73 | 'wtf', 74 | 'ya', 75 | 'yaa', 76 | 'yahoo', 77 | 'yay', 78 | 'yeah', 79 | 'yuck', 80 | 'yup', 81 | "d'oh", 82 | ] 83 | -------------------------------------------------------------------------------- /.github/workflows/build-and-test.yml: -------------------------------------------------------------------------------- 1 | name: Build and test 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | build-and-test: 7 | runs-on: ${{ matrix.os }} 8 | 9 | strategy: 10 | matrix: 11 | node-version: [14.x, 18.x] 12 | os: [ubuntu-latest, windows-latest] 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | 17 | - name: use node.js ${{ matrix.node-version }} 18 | uses: actions/setup-node@v3 19 | with: 20 | node-version: ${{ matrix.node-version }} 21 | 22 | - name: cache dependencies 23 | uses: actions/cache@v3 24 | with: 25 | path: ~/.npm 26 | key: ${{ runner.os }}-npm-${{ matrix.node-version }}-${{ hashFiles('package-lock.json') }} 27 | restore-keys: | 28 | ${{ runner.os }}-npm-${{ 
// Collect the lemma of every `tag` word that directly follows 'le' or 'un'
// in the French side of a sentence pair. Lemmas are appended to `list`.
const doBoth = function (both) {
  const terms = both.fr
  // start at 1 - the first term has no preceding word
  for (let i = 1; i < terms.length; i += 1) {
    const term = terms[i]
    if (term['$'].pos !== tag) {
      continue
    }
    const before = terms[i - 1]['$text'].toLowerCase()
    if (before !== 'le' && before !== 'un') {
      continue
    }
    // the surface word only gates the entry - the lemma is what gets stored
    const word = term['$text']
    const lemma = term['$'].lem
    if (!word || !lemma) {
      continue
    }
    list.push(lemma.toLowerCase().trim())
  }
}
// any negative terms inside the verb-phrase
const getNegative = (vb) => vb.match('#Negative')

// compute the 'root' form on the view, and return its text
const getRoot = function (view) {
  view.compute('root')
  return view.text('root')
}

// Break a verb-phrase into its parts.
// Works on a clone of the given view; returns
// { root, prefix, adverbs, auxiliary, negative }
const parseVerb = function (view) {
  const vb = view.clone()
  const root = getRoot(vb)
  const prefix = vb.match('#Prefix')
  const adverbs = getAdverbs(vb, root)
  const auxiliary = getAuxiliary(vb, root)
  const negative = getNegative(vb)
  return { root, prefix, adverbs, auxiliary, negative }
}
export default parseVerb
// normalized text of each match in a view, or [] when not given a view
const toArray = function (m) {
  if (m && m.isView) {
    return m.json({ normal: true, terms: false, text: false }).map(s => s.normal)
  }
  return []
}

// normalized text of a view, or '' when not given a view
const toText = (m) => (m && m.isView ? m.text('normal') : '')

// Plain-object summary of a verb-phrase, built from parseVerb().
// 'infinitive' is currently the same value as 'root'.
const toJSON = function (vb) {
  const { root, adverbs, auxiliary, negative, prefix } = parseVerb(vb)
  // kept from the original flow (the grammar step that used it is disabled)
  vb = vb.clone().toView()
  return {
    root,
    preAdverbs: toArray(adverbs.pre),
    postAdverbs: toArray(adverbs.post),
    auxiliary: toText(auxiliary),
    negative: negative.found,
    prefix: toText(prefix),
    infinitive: root,
  }
}
export default toJSON
//https://www.world-airport-codes.com/world-top-30-airports.html 4 | 'ams', 5 | 'atl', 6 | 'bcn', 7 | 'bkk', 8 | 'cdg', 9 | 'cgk', 10 | 'clt', 11 | 'den', 12 | 'dfw', 13 | 'dxb', 14 | 'fco', 15 | 'fra', 16 | 'hkg', 17 | 'hnd', 18 | 'iax', 19 | 'icn', 20 | 'ist', 21 | 'jfk', 22 | 'kul', 23 | 'las', 24 | 'lax', 25 | 'lgw', 26 | 'lhr', 27 | 'mco', 28 | 'muc', 29 | 'ord', 30 | 'pek', 31 | 'phl', 32 | 'phx', 33 | 'sfo', 34 | 'syd', 35 | 'yyz', 36 | 37 | 'antarctic ocean', 38 | 'arctic ocean', 39 | 'atlantic ocean', 40 | 'everglades', 41 | 'great britain', 42 | 'great lakes', 43 | 'indian ocean', 44 | 'new england', 45 | 'pacific ocean', 46 | 47 | //continents 48 | 'africa', 49 | 'europe', 50 | 'americas', 51 | 'asia', 52 | 53 | //some notable neighbourhoods (just #Place) 54 | 'midtown', 55 | 'downtown', 56 | 'uptown', 57 | 'the bronx', 58 | 'brooklyn', 59 | 'manhattan', 60 | 'greenwich', 61 | 'soho', 62 | 'harlem', 63 | 'chinatown', 64 | 'the hamptons', 65 | 'beverly hills', 66 | 'bel air', 67 | 'malibu', 68 | 'gay village', 69 | 'sunderland', 70 | ] 71 | -------------------------------------------------------------------------------- /data/models/index.js: -------------------------------------------------------------------------------- 1 | import noun from './noun/plurals.js' 2 | import adjective from './adjective/index.js' 3 | 4 | import futureTense from './verb/future-tense.js' 5 | import imperfect from './verb/imperfect.js' 6 | import pastParticiple from './verb/past-participle.js' 7 | import presentTense from './verb/present-tense.js' 8 | 9 | const vbOrder = ['je', 'tu', 'il', 'nous', 'vous', 'ils'] 10 | const nOrder = ['plural'] 11 | const adjOrder = ['female', 'plural', 'femalePlural'] 12 | const todo = { 13 | noun: { data: noun, keys: nOrder }, 14 | adjective: { data: adjective, keys: adjOrder }, 15 | futureTense: { data: futureTense, keys: vbOrder }, 16 | imperfect: { data: imperfect, keys: vbOrder }, 17 | pastParticiple: { data: pastParticiple, keys: ['prt'] 
}, 18 | presentTense: { data: presentTense, keys: vbOrder }, 19 | } 20 | 21 | // turn our conjugation data into word-pairs 22 | let model = {} 23 | Object.keys(todo).forEach(k => { 24 | model[k] = {} 25 | let { data, keys } = todo[k] 26 | keys.forEach((form, i) => { 27 | let pairs = [] 28 | Object.keys(data).forEach(inf => { 29 | pairs.push([inf, data[inf][i]]) 30 | }) 31 | model[k][form] = pairs 32 | // console.log(k, form, pairs.length) 33 | }) 34 | }) 35 | 36 | export default model 37 | -------------------------------------------------------------------------------- /data/lexicon/people/people.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | //famous people with names that are hard to recognize independendtly 3 | //male 4 | 'hitler', 5 | 'ronaldo', 6 | 'ashton kutcher', 7 | 'barack obama', 8 | 'cardinal wolsey', 9 | 'carson palmer', 10 | 'denzel washington', 11 | 'dick wolf', 12 | 'emeril lagasse', 13 | 'hulk hogan', 14 | 'kanye west', 15 | 'kiefer sutherland', 16 | 'kobe bryant', 17 | 'lebron james', 18 | 'messiaen', 19 | 'mitt romney', 20 | 'mubarek', 21 | 'ray romano', 22 | 'rod stewart', 23 | 'ronaldinho', 24 | 'rush limbaugh', 25 | 'saddam hussain', 26 | 'slobodan milosevic', 27 | 'tiger woods', 28 | 'valentino rossi', 29 | 'van gogh', 30 | 31 | //female 32 | 'halle berry', 33 | 'jk rowling', 34 | 'oprah winfrey', 35 | 'paris hilton', 36 | 'reese witherspoon', 37 | 'scarlett johansson', 38 | 'theresa may', 39 | 'tyra banks', 40 | 'virgin mary', 41 | 42 | //sometimes firstname, sometimes lastname 43 | 'brock', 44 | 'carson', 45 | 'clinton', 46 | 'cruz', 47 | 'dalton', 48 | 'dante', 49 | 'effie', 50 | 'ezekiel', 51 | 'gaston', 52 | 'inez', 53 | 'jaime', 54 | 'jefferson', 55 | 'lee', 56 | 'nettie', 57 | 'ora', 58 | 'palmer', 59 | 'piper', 60 | 'sung', 61 | ] 62 | -------------------------------------------------------------------------------- /learn/wikinews/getLexicon.js: 
// keys of a word->count object, most-frequent first,
// keeping only words counted more than once
const top = function (obj) {
  // -1 when a is more frequent, 1 when b is, 0 on a tie
  const byCount = (a, b) => Number(obj[a] < obj[b]) - Number(obj[a] > obj[b])
  const ranked = Object.keys(obj)
  ranked.sort(byCount)
  return ranked.filter((word) => obj[word] > 1)
}
// [method, ...args] steps, applied in order to carve the document into verb-phrases
const splitSteps = [
  ['splitAfter', '@hasComma'],
  // the reason he will is ...
  // all i do is talk
  ['splitAfter', '[(do|did|am|was|is|will)] (is|was)', 0],
  // like being pampered
  ['splitBefore', '(#Verb && !#Copula) [being] #Verb', 0],
  // like to be pampered
  ['splitBefore', '#Verb [to be] #Verb', 0],
  // implicit conjugation - 'help fix'
  ['splitAfter', '[help] #PresentTense', 0],
  // what i can sell is..
  ['splitBefore', '(#PresentTense|#PastTense) [#Copula]$', 0],
  // what i can sell will be
  ['splitBefore', '(#PresentTense|#PastTense) [will be]$', 0],
]

// final filters, run after all splitting is done
const cleanupSteps = [
  // 'allow yourself'
  ['not', '#Reflexive$'],
  //ensure there's actually a verb
  ['if', '#Verb'],
]

// call each step's method on the running view, feeding the result to the next
const applySteps = (view, steps) => steps.reduce((m, [method, ...args]) => m[method](...args), view)

// Find the verb-phrases in a document; returns the resulting view.
const findVerbs = function (doc) {
  let m = applySteps(doc.match(''), splitSteps)
  // professes love
  const toVerbs = m.match('(#PresentTense|#PastTense) #Infinitive')
  if (toVerbs.found && !toVerbs.has('^go')) {
    m = m.splitBefore('(#PresentTense|#PastTense) [#Infinitive]', 0)
  }
  return applySteps(m, cleanupSteps)
}
export default findVerbs
// pull the optional prefix/suffix (ex. '$', '%') off a parsed number
const makeSuffix = function (obj) {
  return {
    prefix: obj.prefix || '',
    suffix: obj.suffix || '',
  }
}

// Render a parsed number in the requested format.
//  - parsed: { num, prefix?, suffix? }
//  - fmt: 'TextOrdinal' | 'TextCardinal' | 'Ordinal' | 'Cardinal' (anything else: plain digits)
// Returns the formatted string, wrapped in the prefix and suffix.
const formatNumber = function (parsed, fmt) {
  const { prefix, suffix } = makeSuffix(parsed)
  const wrap = (num) => `${prefix}${num}${suffix}`
  if (fmt === 'TextOrdinal') {
    // copy, so the array returned by toText is never modified
    const words = [...toText(parsed.num)]
    if (words.length > 0) {
      const last = words[words.length - 1]
      // only the final word takes the ordinal form; keep the word as-is
      // when no ordinal form is known, rather than dropping it
      words[words.length - 1] = toOrdinal[last] || last
    }
    return wrap(words.join(' '))
  }
  if (fmt === 'TextCardinal') {
    return wrap(toText(parsed.num).join(' '))
  }
  // numeric formats
  // '55e'
  if (fmt === 'Ordinal') {
    const str = String(parsed.num)
    // only 1 itself is written '1er' - 11, 21, 31.. are '11e', '21e', '31e'
    const ending = str === '1' ? 'er' : 'e'
    return wrap(str + ending)
  }
  if (fmt === 'Cardinal') {
    return wrap(String(parsed.num))
  }
  // unknown format: plain digits, or nothing when there is no number (0 is a number)
  const missing = parsed.num === null || parsed.num === undefined || Number.isNaN(parsed.num)
  return wrap(missing ? '' : String(parsed.num))
}
export default formatNumber
/**
 * Sweep through the word's endings, longest first, and return the tag of
 * the first one found in the suffix model.
 *
 * @param {string} str - the word to inspect
 * @param {Array<object>} suffixes - lookup tables indexed by suffix length
 * @returns {string|null} the matching tag, or null
 */
const suffixLoop = function (str = '', suffixes = []) {
  const len = str.length
  // never test the whole word: try at most 7 characters, or one less than the word
  const max = len <= 7 ? len - 1 : 7
  for (let i = max; i > 1; i -= 1) {
    const bucket = suffixes[i]
    // the model may have no entries for this suffix-length
    if (!bucket) {
      continue
    }
    const suffix = str.slice(len - i)
    if (Object.prototype.hasOwnProperty.call(bucket, suffix)) {
      return bucket[suffix]
    }
  }
  return null
}

/**
 * Decide a tag from the ending of the word, for terms that have no tag yet.
 *
 * @param {Array<object>} terms - terms of the sentence
 * @param {number} i - index of the term to look at
 * @param {object} world - reads `methods.one.setTag` and `model.two.suffixPatterns`
 * @returns {true|null} true when a tag was set, otherwise null
 */
const suffixCheck = function (terms, i, world) {
  const setTag = world.methods.one.setTag
  const suffixes = world.model.two.suffixPatterns
  const term = terms[i]
  // only guess for terms that nothing else has tagged
  if (term.tags.size !== 0) {
    return null
  }
  let tag = suffixLoop(term.normal, suffixes)
  if (tag !== null) {
    setTag([term], tag, world, false, '2-suffix')
    term.confidence = 0.7
    return true
  }
  // try implicit form of word, too
  if (term.implicit) {
    tag = suffixLoop(term.implicit, suffixes)
    if (tag !== null) {
      setTag([term], tag, world, false, '2-implicit-suffix')
      term.confidence = 0.7
      return true
    }
  }
  return null
}
'12/01/2018', 22 | // '13/01/2018', 23 | // '5/2/2020', 24 | `le quatorze juillet.`, 25 | 'Mercredi 11 mars', 26 | `Le 6 avril`, 27 | `Il n'y a pas d'augmentation prévue jusqu'en 2032`, 28 | `le 3 novembre 2012`, 29 | 'je suis né le 2 septembre 1982', 30 | 'rendez-vous avant vendredi', 31 | `je t'appellerai jusqu'en septembre`, 32 | `15/12/2020`, 33 | `2020-10-02T07:10:12`, 34 | `juin 2e`, 35 | `2021-02-12`, 36 | `je suis né en juin`, 37 | `ta voiture jusqu’à lundi prochain`, 38 | `entre sept et oct`, 39 | ] 40 | let doc = nlp(arr[0]).debug() 41 | 42 | // let m = doc.match('[#Value] [#Month]') 43 | // m.debug() 44 | // m.groups().date.debug() 45 | // m.groups().month.debug() 46 | 47 | let json = doc.dates({ timezone: 'UTC', today: '2023-03-02' }).json({ terms: false }) 48 | console.dir(json, { depth: 5 }) -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "extends": [ 4 | "eslint:recommended", 5 | "plugin:regexp/recommended" 6 | ], 7 | "ignorePatterns": [ 8 | "builds/*", 9 | "learn/**", 10 | "scripts/**", 11 | "plugins/dates/**" 12 | ], 13 | "env": { 14 | "es6": true, 15 | "browser": true, 16 | "node": true 17 | }, 18 | "parserOptions": { 19 | "ecmaVersion": "latest", 20 | "sourceType": "module" 21 | }, 22 | "rules": { 23 | "comma-dangle": [ 24 | 1, 25 | "only-multiline" 26 | ], 27 | "quotes": [ 28 | 0, 29 | "single", 30 | "avoid-escape" 31 | ], 32 | "max-nested-callbacks": [ 33 | 1, 34 | 4 35 | ], 36 | "max-params": [ 37 | 1, 38 | 5 39 | ], 40 | "consistent-return": 1, 41 | "no-bitwise": 1, 42 | "no-empty": 1, 43 | "no-console": 1, 44 | "no-duplicate-imports": 1, 45 | "no-eval": 2, 46 | "no-implied-eval": 2, 47 | "no-mixed-operators": 2, 48 | "no-multi-assign": 2, 49 | "no-nested-ternary": 1, 50 | "no-prototype-builtins": 0, 51 | "no-self-compare": 1, 52 | "no-sequences": 1, 53 | "no-shadow": 2, 54 | 
"no-unmodified-loop-condition": 1, 55 | "no-use-before-define": 1, 56 | "prefer-const": 0, 57 | "radix": 1, 58 | "no-unused-vars": 1, 59 | "regexp/prefer-d": 0, 60 | "regexp/prefer-w": 0, 61 | "regexp/prefer-range": 0, 62 | "regexp/no-unused-capturing-group": 0 63 | } 64 | } -------------------------------------------------------------------------------- /data/lexicon/misc/currencies.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | '¢', 3 | '$', 4 | '£', 5 | '¥', 6 | '฿', 7 | '₡', 8 | '€', 9 | '₭', 10 | '₨', 11 | '﷼', 12 | 'aud', 13 | 'baht', 14 | 'bitcoin', 15 | 'bitcoins', 16 | 'cad', 17 | 'cent', 18 | 'cents', 19 | 'cny', 20 | 'denar', 21 | 'denars', 22 | 'dime', 23 | 'dimes', 24 | 'dinar', 25 | 'dinars', 26 | 'dirham', 27 | 'dirhams', 28 | 'dkk', 29 | 'dobra', 30 | 'dobras', 31 | 'dollar', 32 | 'dollars', 33 | 'eur', 34 | 'euro', 35 | 'euros', 36 | 'forint', 37 | 'forints', 38 | 'franc', 39 | 'francs', 40 | 'gbp', 41 | 'hkd', 42 | 'inr', 43 | 'jpy', 44 | 'kn', 45 | 'kr', 46 | 'nis', 47 | 'krona', 48 | 'kronas', 49 | 'krw', 50 | 'kwanza', 51 | 'kwanzas', 52 | 'kyat', 53 | 'kyats', 54 | 'lei', 55 | 'lempira', 56 | 'lempiras', 57 | 'lira', 58 | 'liras', 59 | 'pence', 60 | 'pences', 61 | 'pennies', 62 | 'penny', 63 | 'peso', 64 | 'pesos', 65 | 'pound sterling', 66 | 'pound sterlings', 67 | 'pound', 68 | 'pounds', 69 | 'riel', 70 | 'rouble', 71 | 'roubles', 72 | 'rp', 73 | 'rupee', 74 | 'rupees', 75 | 'shekel', 76 | 'shekels', 77 | 'sheqel', 78 | 'sheqels', 79 | 'shilling', 80 | 'shillings', 81 | 'sterling', 82 | 'sterlings', 83 | 'usd', 84 | 'xaf', 85 | 'xof', 86 | 'yen', 87 | 'yuan', 88 | 'yuans', 89 | 'zł', 90 | 'zloty', 91 | 'zlotys', 92 | 'ден', 93 | 'лв', 94 | 'руб', 95 | ] 96 | -------------------------------------------------------------------------------- /src/03-three/numbers/data.js: -------------------------------------------------------------------------------- 1 | export default { 2 | 3 | 
ones: [ 4 | [0, 'zero', 'zeroième'], 5 | [1, 'un', 'unième'], 6 | [2, 'deux', 'deuxième'], 7 | [3, 'trois', 'troisième'], 8 | [4, 'quatre', 'quatrième'], 9 | [5, 'cinq', 'cinquième'], 10 | [6, 'six', 'sixième'], 11 | [7, 'sept', 'septième'], 12 | [8, 'huit', 'huitième'], 13 | [9, 'neuf', 'neuvième'], 14 | [10, 'dix', 'dixième'], 15 | [11, 'onze', 'onzième'], 16 | [12, 'douze', 'douzième'], 17 | [13, 'treize', 'treizième'], 18 | [14, 'quatorze', 'quatorzième'], 19 | [15, 'quinze', 'quinzième'], 20 | [16, 'seize', 'seizième'], 21 | [17, 'dix sept', 'dix septième'], 22 | [18, 'dix huit', 'dix huitième'], 23 | [19, 'dix neuf', 'dix neuvième'], 24 | ], 25 | tens: [ 26 | [20, 'vingt', 'vingtième'], 27 | [30, 'trente', 'trentième'], 28 | [40, 'quarante', 'quarantième'], 29 | [50, 'cinquante', 'cinquantième'], 30 | [60, 'soixante', 'soixantième'], 31 | [70, 'soixante dix', 'soixante dixième'], 32 | [80, 'quatre vingt', 'quatre vingtième'], 33 | [90, 'quatre vingt dix', 'quatre vingt dixième'], 34 | ], 35 | multiples: [ 36 | [100, 'cent', 'centième'], 37 | [1000, 'mille', 'millième'], 38 | [1000000, 'million', 'millionième'],//million 1000,000 39 | [1000000000, 'milliard', 'milliardième'],//billion 1000,000,000 40 | // [1000000000000, 'mille milliards', 'mille milliardième'],//trillion 1000,000,000 41 | ] 42 | 43 | } -------------------------------------------------------------------------------- /plugins/dates/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "fr-compromise-dates", 3 | "description": "plugin for fr-compromise", 4 | "version": "0.0.2", 5 | "author": "Spencer Kelly (http://spencermounta.in)", 6 | "main": "./src/plugin.js", 7 | "unpkg": "./builds/fr-compromise-dates.min.js", 8 | "module": "./builds/fr-compromise-dates.mjs", 9 | "type": "module", 10 | "sideEffects": false, 11 | "types": "./index.d.ts", 12 | "exports": { 13 | ".": { 14 | "import": "./src/plugin.js", 15 | "require": 
"./builds/fr-compromise-dates.cjs", 16 | "types": "./index.d.ts" 17 | } 18 | }, 19 | "repository": { 20 | "type": "git", 21 | "url": "git://github.com/nlp-compromise/fr-compromise.git" 22 | }, 23 | "homepage": "https://github.com/nlp-compromise/fr-compromise/tree/master/plugins/dates", 24 | "scripts": { 25 | "test": "tape \"./tests/**/*.test.js\" | tap-dancer --color always", 26 | "testb": "cross-env TESTENV=prod tape \"./tests/**/*.test.js\" | tap-dancer --color always", 27 | "watch": "amble ./scratch.js", 28 | "perf": "node ./scripts/perf.js", 29 | "build": "rollup -c --silent" 30 | }, 31 | "files": [ 32 | "builds/", 33 | "src/", 34 | "index.d.ts" 35 | ], 36 | "eslintIgnore": [ 37 | "builds/*.js" 38 | ], 39 | "peerDependencies": { 40 | "fr-compromise": ">=0.2.0" 41 | }, 42 | "dependencies": { 43 | "spacetime": "7.4.3", 44 | "spacetime-holiday": "0.3.0" 45 | }, 46 | "license": "MIT" 47 | } -------------------------------------------------------------------------------- /src/03-three/contractions/api.js: -------------------------------------------------------------------------------- 1 | const titleCase = /^\p{Lu}[\p{Ll}'’]/u //upercase, then lowercase 2 | // import contract from './contract.js' 3 | 4 | const toTitleCase = function (str = '') { 5 | str = str.replace(/^ *[a-z\u00C0-\u00FF]/, x => x.toUpperCase()) //TODO: support unicode 6 | return str 7 | } 8 | 9 | const api = function (View) { 10 | /** */ 11 | class Contractions extends View { 12 | constructor(document, pointer, groups) { 13 | super(document, pointer, groups) 14 | this.viewType = 'Contraction' 15 | } 16 | /** i've -> 'i have' */ 17 | expand() { 18 | this.docs.forEach(terms => { 19 | let isTitleCase = titleCase.test(terms[0].text) 20 | terms.forEach((t, i) => { 21 | t.text = t.implicit 22 | delete t.implicit 23 | //add whitespace 24 | if (i < terms.length - 1 && t.post === '') { 25 | t.post += ' ' 26 | } 27 | // flag it as dirty 28 | t.dirty = true 29 | }) 30 | // make the first word title-case? 
/**
 * Parse a numeric (non-text) value like '12', '1,200', '$12' or '2e'.
 *
 * @param {object} m - a view; reads `m.text('normal')` and `m.length`
 * @returns {{hasComma: boolean, prefix: string, num: (number|string|null), suffix: string}}
 *   `num` is null when the digits-part does not form a real number
 */
const fromNumber = function (m) {
  let str = m.text('normal').toLowerCase()
  // remove a trailing ordinal marker - '2e', '1er'
  str = str.replace(/(e|er)$/, '')
  let hasComma = false
  if (/,/.test(str)) {
    hasComma = true
    str = str.replace(/,/g, '')
  }
  // get prefix/suffix - the captured numeric run is kept as the 2nd element
  let arr = str.split(/([-0-9.,]*)/)
  let [prefix, num] = arr
  let suffix = arr.slice(2).join('')
  if (num !== '' && m.length < 2) {
    num = Number(num || str)
    // ensure that num is an actual number - Number() gives NaN for things like '-' or '1.2.3'
    if (Number.isNaN(num)) {
      num = null
    }
    // strip an ordinal off the suffix
    if (suffix === 'e' || suffix === 'er') {
      suffix = ''
    }
  }
  return {
    hasComma,
    prefix,
    num,
    suffix,
  }
}
/**
 * A single point in time, wrapping a spacetime object in `this.s`.
 * Subclasses set `this.unit` to widen what start()/end() snap to.
 */
class Moment {
  /**
   * @param {*} input - anything spacetime accepts as a date input
   * @param {object} [opts] - parsing options; `opts.timezone` is handed to spacetime
   */
  constructor(input, opts) {
    this.unit = 'millisecond'
    this.opts = opts || {}
    // read from the normalised options, so a missing opts does not throw
    this.s = spacetime(input, this.opts.timezone)
  }
  /** move to the start of this unit */
  start() {
    this.s = this.s.startOf(this.unit)
    return this
  }
  /** move to the end of this unit */
  end() {
    this.s = this.s.endOf(this.unit)
    return this
  }
  /** the middle of a single moment is itself */
  mid() {
    //do nothing
    return this
  }
  /** @returns {string} the ISO form produced by spacetime */
  iso() {
    return this.s.iso()
  }
}
? spacetime(iso).format('{iso-short}') : '-') 6 | 7 | test('this monday', function (t) { 8 | let arr = [ 9 | ['2020-12-7', '2020-12-07'], //mon (itself) 10 | ['2020-12-8', '2020-12-14'], //tues 11 | ['2020-12-9', '2020-12-14'], //wed 12 | ['2020-12-10', '2020-12-14'], //thu 13 | ['2020-12-11', '2020-12-14'], //fri 14 | ['2020-12-12', '2020-12-14'], //sat 15 | ['2020-12-13', '2020-12-14'], //sun 16 | ] 17 | arr.forEach((a) => { 18 | let doc = nlp('this monday') 19 | let found = doc.dates({ today: a[0] }).json()[0] 20 | t.equal(fmt(found.dates.start), a[1], 'monday-start') 21 | t.equal(fmt(found.dates.end), a[1], 'monday-end') 22 | }) 23 | t.end() 24 | }) 25 | 26 | // test('last monday', function (t) { 27 | // let arr = [ 28 | // ['2020-12-7', '2020-11-30'], //mon (obvious) 29 | // ['2020-12-8', '2020-11-30'], //tues 30 | // ['2020-12-9', '2020-11-30'], //wed 31 | // ['2020-12-10', '2020-11-30'], //thu 32 | // ['2020-12-11', '2020-11-30'], //fri 33 | // ['2020-12-12', '2020-11-30'], //sat 34 | // ['2020-12-13', '2020-11-30'], //sun 35 | // ] 36 | // arr.forEach((a) => { 37 | // let doc = nlp('last monday') 38 | // let found = doc.dates({ today: a[0] }).json()[0] 39 | // t.equal(fmt(found.date.start), a[1], 'last-monday-start') 40 | // t.equal(fmt(found.date.end), a[1], 'last-monday-end') 41 | // }) 42 | // t.end() 43 | // }) 44 | -------------------------------------------------------------------------------- /data/lexicon/nouns/uncountables.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'anglais', 3 | 'os', 4 | 'bois', 5 | 'corps', 6 | 'bras', 7 | 'poids', 8 | 'repas', 9 | 'sens', 10 | 11 | 12 | 'conseils',//advice 13 | 'munitions',//ammunition 14 | 'asperges',//asparagus 15 | 'combles',//attic 16 | 'spectateurs',//audience 17 | 'auditeurs',// 18 | 'baggage',//luggage 19 | 'bagages',// 20 | 'brocolis',//broccoli 21 | 'affaires',//business 22 | 'dégâts',//damage 23 | 'céréales',//cereal 24 | 
'échecs',//chess 25 | 'vêtements',//clothing 26 | 'coordonnées',//address 27 | 'ténèbres',//darkness 28 | 'datadonnées',//** 29 | 'débris',//debris 30 | 'arrhes',//deposit 31 | 'recherches',//research 32 | 'fiançailles',//engagement 33 | 'remords',//remorse 34 | 'victuailles',//food 35 | 'prévisions',//forecast 36 | 'fruits',//fruit 37 | 'funérailles',//funeral 38 | 'obsèques',// 39 | 'meubles',//furniture 40 | 'garbage',//rubbish 41 | 'ordures', 42 | 'déchets',// 43 | 'graffitis',//graffiti 44 | 'cheveux',//hair 45 | 'ravages',//havoc 46 | 'foins',//hay 47 | 'chevrons',//herringbone 48 | 'devoirs',//homework 49 | 'renseignements',//information 50 | 'médicaments',//medicine 51 | 'abats',//offal 52 | 'pâtes',//pasta 53 | 'décombres',//rubble 54 | 'sciences*',//science 55 | 'crevettes',//shrimp 56 | 'logiciels',//software 57 | 'spaghettis',//spaghetti 58 | 'épinards',//spinach 59 | 'parasites',//static 60 | 'transports',//transportation 61 | 'vacances',//vacation 62 | 'environs',//vicinity 63 | 'fumerolles',//gas 64 | 'noces',//wedding 65 | ] -------------------------------------------------------------------------------- /src/02-two/preTagger/compute/1st-pass/year.js: -------------------------------------------------------------------------------- 1 | const min = 1400 2 | const max = 2100 3 | 4 | const dateWords = new Set(['pendant', 'dans', 'avant', 'apres', 'pour', 'en']) 5 | 6 | const seemsGood = function (term) { 7 | if (!term) { 8 | return false 9 | } 10 | if (dateWords.has(term.normal)) { 11 | return true 12 | } 13 | if (term.tags.has('Date') || term.tags.has('Month') || term.tags.has('WeekDay')) { 14 | return true 15 | } 16 | return false 17 | } 18 | 19 | const seemsOkay = function (term) { 20 | if (!term) { 21 | return false 22 | } 23 | if (term.tags.has('Ordinal')) { 24 | return true 25 | } 26 | return false 27 | } 28 | 29 | // recognize '1993' as a year 30 | const tagYear = function (terms, i, world) { 31 | let setTag = world.methods.one.setTag 32 | const 
// maître
// traître

// endings of feminine-plural adjectives
const femalePluralSuffixes = [
  /[aei]lles$/,
  /[aei]les$/,
  /[aeiou]ttes$/,
  /ntes$/,
  /i[vct]es$/,
  /uses$/,
  /sses$/,
  /[èuay]res$/,
  /ires$/,
  /ées$/,
  /ues$/,
  /ies$/,
  /[ndvt]es$/,
]

/**
 * Guess the grammatical gender of an adjective from its ending.
 *
 * @param {string} str - the adjective
 * @returns {'f'|'m'} 'f' for a feminine-looking ending, otherwise 'm'
 */
const guessGender = function (str) {
  // female singular
  if (/[eë]$/.test(str)) {
    return 'f'
  }
  // female plurals
  if (femalePluralSuffixes.some(reg => reg.test(str))) {
    return 'f'
  }
  return 'm'
}

/**
 * Give a gender tag to each Adjective that does not already have one.
 *
 * @param {Array<object>} terms - terms of the sentence
 * @param {number} i - index of the term to look at
 * @param {object} world - reads `methods.one.setTag`
 * @returns {*} whatever setTag returns, or null when nothing was tagged
 */
const adjGender = function (terms, i, world) {
  const setTag = world.methods.one.setTag
  const term = terms[i]
  const tags = term.tags
  // tag names in the set carry no '#' - that prefix is match-syntax only
  const hasGender = tags.has('FemaleAdjective') || tags.has('MaleAdjective')
  if (!tags.has('Adjective') || hasGender) {
    return null
  }
  const str = term.implicit || term.normal || term.text || ''
  // i actually think there are no exceptions.
  const tag = guessGender(str) === 'f' ? 'FemaleAdjective' : 'MaleAdjective'
  return setTag([term], tag, world, false, '3-adj-gender')
}
opts) { 48 | let arr = [] 49 | matches.forEach(view => { 50 | for (let i = 0; i < phrases.length; i += 1) { 51 | let { match, cb } = phrases[i] 52 | let m = view.match(match) 53 | if (m.found) { 54 | let res = cb(m, opts) 55 | if (res) { 56 | arr.push(res) 57 | return 58 | } 59 | } 60 | } 61 | 62 | 63 | }) 64 | return arr 65 | } 66 | export default parsePhrase -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "author": "Spencer Kelly (http://spencermounta.in)", 3 | "name": "fr-compromise", 4 | "description": "Linguistique computationnelle modeste", 5 | "version": "0.2.8", 6 | "main": "./builds/fr-compromise.mjs", 7 | "unpkg": "./builds/fr-compromise.min.js", 8 | "type": "module", 9 | "sideEffects": false, 10 | "exports": { 11 | ".": { 12 | "import": "./builds/fr-compromise.mjs", 13 | "require": "./builds/fr-compromise.cjs", 14 | "types": "./types/index.d.ts" 15 | } 16 | }, 17 | "types": "types/index.d.ts", 18 | "repository": { 19 | "type": "git", 20 | "url": "git://github.com/nlp-compromise/fr-compromise.git" 21 | }, 22 | "scripts": { 23 | "test": "tape \"./tests/**/*.test.js\" | tap-dancer", 24 | "testb": "cross-env TESTENV=prod npm run test", 25 | "build": "npm run version && rollup -c --silent", 26 | "pack": "node ./scripts/pack.js", 27 | "watch": "amble ./scratch.js", 28 | "version": "node ./scripts/version.js", 29 | "score": "node ./learn/giga/test.js", 30 | "lint": "eslint ./src/**/*", 31 | "stress": "node scripts/stress.js" 32 | }, 33 | "files": [ 34 | "builds/", 35 | "types/", 36 | "src/" 37 | ], 38 | "dependencies": { 39 | "compromise": "14.10.0", 40 | "efrt": "2.7.0", 41 | "suffix-thumb": "5.0.2" 42 | }, 43 | "devDependencies": { 44 | "@rollup/plugin-node-resolve": "15.2.0", 45 | "@rollup/plugin-terser": "0.4.3", 46 | "amble": "1.3.0", 47 | "cross-env": "^7.0.3", 48 | "eslint": "8.47.0", 49 | "eslint-plugin-regexp": 
"1.15.0", 50 | "fr-corpus": "^0.0.1", 51 | "rollup": "3.28.0", 52 | "tap-dancer": "0.3.4", 53 | "tape": "5.6.6" 54 | }, 55 | "license": "MIT" 56 | } 57 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | import nlp from './_lib.js' 2 | import tokenize from './01-one/tokenize/plugin.js' 3 | import lexicon from './01-one/lexicon/plugin.js' 4 | import preTagger from './02-two/preTagger/plugin.js' 5 | import postTagger from './02-two/postTagger/plugin.js' 6 | import tagset from './02-two/tagset/plugin.js' 7 | import numbers from './03-three/numbers/plugin.js' 8 | import topics from './03-three/topics/plugin.js' 9 | import verbs from './03-three/verbs/plugin.js' 10 | import adjectives from './03-three/adjectives/plugin.js' 11 | import nouns from './03-three/nouns/plugin.js' 12 | import contractions from './03-three/contractions/plugin.js' 13 | import version from './_version.js' 14 | 15 | nlp.plugin(tokenize) 16 | nlp.plugin(tagset) 17 | nlp.plugin(lexicon) 18 | nlp.plugin(preTagger) 19 | nlp.plugin(postTagger) 20 | nlp.plugin(numbers) 21 | nlp.plugin(topics) 22 | nlp.plugin(verbs) 23 | nlp.plugin(adjectives) 24 | nlp.plugin(nouns) 25 | nlp.plugin(contractions) 26 | 27 | const fr = function (txt, lex) { 28 | let dok = nlp(txt, lex) 29 | return dok 30 | } 31 | 32 | // copy constructor methods over 33 | Object.keys(nlp).forEach(k => { 34 | if (nlp.hasOwnProperty(k)) { 35 | fr[k] = nlp[k] 36 | } 37 | }) 38 | 39 | // this one is hidden 40 | Object.defineProperty(fr, '_world', { 41 | value: nlp._world, 42 | writable: true, 43 | }) 44 | 45 | 46 | 47 | /** log the decision-making to console */ 48 | fr.verbose = function (set) { 49 | let env = typeof process === 'undefined' ? self.env || {} : process.env //use window, in browser 50 | env.DEBUG_TAGS = set === 'tagger' || set === true ? 
true : '' 51 | env.DEBUG_MATCH = set === 'match' || set === true ? true : '' 52 | env.DEBUG_CHUNKS = set === 'chunker' || set === true ? true : '' 53 | return this 54 | } 55 | fr.version = version 56 | 57 | export default fr -------------------------------------------------------------------------------- /learn/giga/getPairs.js: -------------------------------------------------------------------------------- 1 | import { forEachSync } from './_giga.js' 2 | import doSentences from './french.js' 3 | import fs from 'fs' 4 | 5 | let ids = [] 6 | for (let i = 1; i <= 10; i += 1) { 7 | let str = String(i).padStart(4, '0') 8 | ids.push(str) 9 | } 10 | // ids = ['0004'] 11 | 12 | // ABR abbreviation 13 | // ADJ adjective 14 | // ADV adverb 15 | 16 | // VER:pres verb present 17 | // VER:simp verb simple past 18 | // VER:futu verb futur 19 | // VER:cond verb conditional 20 | // VER:impe verb imperative 21 | // VER:impf verb imperfect 22 | // VER:infi verb infinitive 23 | // VER:pper verb past participle 24 | // VER:ppre verb present participle 25 | // VER:subi verb subjunctive imperfect 26 | // VER:subp verb subjunctive present 27 | 28 | // "NOM": true, 29 | let pairs = {} 30 | const tag = 'NOM' 31 | // const prev = 'les' 32 | 33 | let results = {} 34 | const doBoth = function (both) { 35 | let terms = both.fr 36 | terms.forEach((term, i) => { 37 | if (i === 0) { 38 | return 39 | } 40 | if (term['$'].pos === tag) { 41 | console.log(term) 42 | // let last = terms[i - 1]['$text'].toLowerCase() 43 | // if (last === prev) { 44 | // let w = term['$text'] 45 | // let inf = term['$'].lem 46 | // // console.log(last, w, inf) 47 | // if (w && inf) { 48 | // w = w.toLowerCase().trim() 49 | // inf = inf.toLowerCase().trim() 50 | // results[w] = inf 51 | // } 52 | // } 53 | } 54 | }) 55 | } 56 | 57 | await forEachSync(ids, async id => { 58 | try { 59 | console.log(`\ndoing ${id}:\n`) 60 | await doSentences(id, doBoth) 61 | } catch (e) { 62 | console.log(e) 63 | } 64 | }) 65 | 
console.log('done') 66 | results = Object.entries(results) 67 | fs.writeFileSync('./pairs.js', 'export default ' + JSON.stringify(results)) 68 | -------------------------------------------------------------------------------- /types/view/fr.ts: -------------------------------------------------------------------------------- 1 | import View from './one' 2 | 3 | 4 | interface Numbers extends View { 5 | /** grab the parsed number */ 6 | parse: (n?: number) => object[] 7 | /** grab the parsed number */ 8 | get: (n?: number) => number | number[] 9 | /** grab 'kilos' from `25 kilos' */ 10 | // units: () => View 11 | /** return only ordinal numbers */ 12 | isOrdinal: () => View 13 | /** return only cardinal numbers */ 14 | isCardinal: () => View 15 | /** convert number to `5` or `5th` */ 16 | toNumber: () => View 17 | /** add commas, or nicer formatting for numbers */ 18 | toLocaleString: () => View 19 | /** convert number to `five` or `fifth` */ 20 | toText: () => View 21 | /** convert number to `five` or `5` */ 22 | toCardinal: () => View 23 | /** convert number to `fifth` or `5th` */ 24 | toOrdinal: () => View 25 | /** return numbers with this value */ 26 | isEqual: () => View 27 | /** return numbers bigger than n */ 28 | greaterThan: (min: number) => View 29 | /** return numbers smaller than n */ 30 | lessThan: (max: number) => View 31 | /** return numbers between min and max */ 32 | between: (min: number, max: number) => View 33 | /** set number to n */ 34 | set: (n: number) => View 35 | /** increase number by n */ 36 | add: (n: number) => View 37 | /** decrease number by n*/ 38 | subtract: (n: number) => View 39 | /** increase number by 1 */ 40 | increment: () => View 41 | /** decrease number by 1*/ 42 | decrement: () => View 43 | } 44 | 45 | interface Contractions extends View { 46 | /** */ 47 | expand(): View 48 | } 49 | 50 | 51 | 52 | interface FrView extends View { 53 | /** return any multi-word terms, like "didn't" */ 54 | contractions: (n?: number) => 
Contractions 55 | /** */ 56 | numbers(): Numbers 57 | /** */ 58 | topics(): View 59 | } 60 | 61 | export default FrView -------------------------------------------------------------------------------- /src/02-two/preTagger/compute/3rd-pass/verb-tense.js: -------------------------------------------------------------------------------- 1 | const tenses = [ 2 | 'PresentTense', 3 | 'Infinitive', 4 | 'Imperative', 5 | 'Gerund', 6 | 'PastTense', 7 | 'Modal', 8 | 'Auxiliary', 9 | 'PerfectTense', 10 | 'Pluperfect', 11 | 'ConditionalVerb', 12 | 'FutureTense', 13 | ] 14 | 15 | 16 | let whichTense = [ 17 | 18 | //er - present conditional 19 | ['erais', 'ConditionalVerb'], 20 | ['erait', 'ConditionalVerb'], 21 | ['erions', 'ConditionalVerb'], 22 | ['eriez', 'ConditionalVerb'], 23 | ['eraient', 'ConditionalVerb'], 24 | 25 | //er- future 26 | ['erai', 'FutureTense'], 27 | ['era', 'FutureTense'], 28 | ['erons', 'FutureTense'], 29 | ['erez', 'FutureTense'], 30 | ['eront', 'FutureTense'], 31 | 32 | // er - imparfait -> PastTense 33 | ['ais', 'PastTense'], 34 | ['ait', 'PastTense'], 35 | ['ions', 'PastTense'], 36 | ['iez', 'PastTense'], 37 | ['ient', 'PastTense'], 38 | 39 | // past-participle 40 | ['ées', 'PastParticiple'], 41 | ['és', 'PastParticiple'], 42 | ['ée', 'PastParticiple'], 43 | ['é', 'Participle'], 44 | ['u', 'Participle'],//entendu 45 | ] 46 | 47 | 48 | // guess a tense tag each Verb 49 | const verbTense = function (terms, i, world) { 50 | let setTag = world.methods.one.setTag 51 | let term = terms[i] 52 | let tags = term.tags 53 | if (tags.has('Verb')) { 54 | // console.log(term) 55 | let str = term.implicit || term.normal || term.text || '' 56 | // if we have no tense 57 | if (!tenses.find(s => tags.has(s))) { 58 | let found = whichTense.find(a => str.endsWith(a[0])) 59 | if (found) { 60 | setTag([term], found[1], world, false, '3-tense-suffix-' + found[1]) 61 | } else { 62 | setTag([term], 'PresentTense', world, false, '3-tense-fallback') 63 | } 64 | } 65 | } 66 | 
// Learning script: counts which tags each word-ending is seen with in the
// parsed corpus, then prints the endings that (almost) only ever carry `wantTag`.
// `./parse` is expected to yield a list of sentences, each a list of
// `{ word, tag }` objects (that is how it is consumed below).
let lines = require('./parse')
// lines = lines.slice(0, 300)

// how many trailing characters make up a suffix
const end = 5

// tags and counts noted by the original author (presumably corpus frequencies):
// 'P+D': 241,
// ADJ: 719,
// ADV: 311,
// CC: 172,
// CLO: 32,
// CLR: 53,
// CLS: 88,
// CS: 90,
// DET: 1353,
// ET: 136,

// nouns:
// NC: 1877,
// NPP: 493,
// P: 1242,
// PREF: 8,

// PRO: 43, //pronoun
// PROREL: 89, //relative pronoun
// U: 100,

// V: 509,
// VINF: 140,
// VPP: 402,
// VPR: 61,
// VS: 10,

const hasDigit = /[0-9]/

// suffix -> { tag: count }
let tags = {}
lines.forEach((s) => {
  s.forEach((w) => {
    // words no longer than the suffix itself tell us nothing
    if (w.word.length <= end) {
      return
    }
    // take the ending from the lowercased string itself, so the offset can
    // never disagree with the string being sliced (and avoid deprecated substr)
    let suffix = w.word.toLowerCase().slice(-end)
    if (hasDigit.test(suffix)) {
      return
    }
    tags[suffix] = tags[suffix] || {}
    tags[suffix][w.tag] = (tags[suffix][w.tag] || 0) + 1
  })
})

const wantTag = 'N'
// a two-tag suffix is only cleaned-up when the wanted tag was seen more than this
const cleanupMin = 5
// a suffix is only reported when the wanted tag was seen more than this
const reportMin = 90

let found = {}
Object.keys(tags).forEach((k) => {
  let counts = tags[k]
  let foundTags = Object.keys(counts)
  // forgive a single stray tagging on an otherwise well-attested suffix
  if (foundTags.length === 2 && counts[wantTag] > cleanupMin) {
    foundTags.forEach((tag) => {
      if (counts[tag] === 1) {
        delete counts[tag]
      }
    })
    foundTags = Object.keys(counts)
  }
  // keep suffixes that are now unambiguous, wanted, and frequent enough
  if (foundTags.length === 1 && foundTags[0] === wantTag && counts[wantTag] > reportMin) {
    found[k] = wantTag
  }
})
console.log(found)
// a hugely-ignorant, and widely subjective transliteration of latin, cyrillic, greek unicode characters to english ascii.
// approximate visual (not semantic or phonetic) relationship between unicode and ascii characters
// http://en.wikipedia.org/wiki/List_of_Unicode_characters
// https://docs.google.com/spreadsheet/ccc?key=0Ah46z755j7cVdFRDM1A2YVpwa1ZYWlpJM2pQZ003M0E

// allowed french symbols - these are deliberately absent from the table below:
// ç – la cédille (the cedilla)
// é – l'accent aigu (the acute accent)
// â/ê/î/ô/û – l'accent circonflexe (the circumflex)
// à/è/ì/ò/ù – l'accent grave (the grave accent)
// ë/ï/ü

// ascii replacement -> every character that should collapse to it
const compact = {
  '!': '¡',
  '?': '¿Ɂ',
  '"': '“”"❝❞',
  "'": '‘‛❛❜’',
  '-': '—–',
  a: 'ªÁÃÄÅáãäåĀāĂăĄąǍǎǞǟǠǡǺǻȀȁȂȃȦȧȺΆΑΔΛάαλАаѦѧӐӑӒӓƛæ',
  b: 'ßþƀƁƂƃƄƅɃΒβϐϦБВЪЬвъьѢѣҌҍ',
  c: '¢©ĆćĈĉĊċČčƆƇƈȻȼͻͼϲϹϽϾСсєҀҁҪҫ',
  d: 'ÐĎďĐđƉƊȡƋƌ',
  e: 'ĒēĔĕĖėĘęĚěƐȄȅȆȇȨȩɆɇΈΕΞΣέεξϵЀЁЕеѐёҼҽҾҿӖӗ',
  f: 'ƑƒϜϝӺӻҒғſ',
  g: 'ĜĝĞğĠġĢģƓǤǥǦǧǴǵ',
  h: 'ĤĥĦħƕǶȞȟΉΗЂЊЋНнђћҢңҤҥҺһӉӊ',
  I: 'Í',
  i: 'íĨĩĪīĬĭĮįİıƖƗȈȉȊȋΊΐΪίιϊІЇії',
  j: 'ĴĵǰȷɈɉϳЈј',
  k: 'ĶķĸƘƙǨǩΚκЌЖКжкќҚқҜҝҞҟҠҡ',
  l: 'ĹĺĻļĽľĿŀŁłƚƪǀǏǐȴȽΙӀӏ',
  m: 'ΜϺϻМмӍӎ',
  n: 'ÑñŃńŅņŇňŉŊŋƝƞǸǹȠȵΝΠήηϞЍИЙЛПийлпѝҊҋӅӆӢӣӤӥπ',
  o: 'ÓÕÖØðóõöøŌōŎŏŐőƟƠơǑǒǪǫǬǭǾǿȌȍȎȏȪȫȬȭȮȯȰȱΌΘΟθοσόϕϘϙϬϴОФоѲѳӦӧӨөӪӫ',
  p: 'ƤΡρϷϸϼРрҎҏÞ',
  q: 'Ɋɋ',
  r: 'ŔŕŖŗŘřƦȐȑȒȓɌɍЃГЯгяѓҐґ',
  s: 'ŚśŜŝŞşŠšƧƨȘșȿЅѕ',
  t: 'ŢţŤťŦŧƫƬƭƮȚțȶȾΓΤτϮТт',
  u: 'µÚúŨũŪūŬŭŮůŰűŲųƯưƱƲǓǔǕǖǗǘǙǚǛǜȔȕȖȗɄΰμυϋύ',
  v: 'νѴѵѶѷ',
  w: 'ŴŵƜωώϖϢϣШЩшщѡѿ',
  x: '×ΧχϗϰХхҲҳӼӽӾӿ',
  y: 'ÝýÿŶŷŸƳƴȲȳɎɏΎΥΫγψϒϓϔЎУучўѰѱҮүҰұӮӯӰӱӲӳ',
  z: 'ŹźŻżŽžƵƶȤȥɀΖ',
  oe: 'œ',
}

// decompress the table into a flat { character: replacement } lookup
const unicode = {}
Object.keys(compact).forEach(function (k) {
  compact[k].split('').forEach(function (s) {
    unicode[s] = k
  })
})

export default unicode
import { Lexicon, Plugin, matchOptions, Match, Net } from './misc'
import View from './view/fr'

/** parse a given text */
declare function nlp(text: string, lexicon?: Lexicon): View

// Constructor
declare module nlp {
  /** interpret text without tagging */
  export function tokenize(text: string, lexicon?: Lexicon): View
  /** scan through text with minimal analysis */
  export function lazy(text: string, match?: string): View
  /** mix-in a compromise plugin */
  export function plugin(plugin: Plugin): any
  /** mix-in a compromise plugin */
  export function extend(plugin: Plugin): any
  /** turn a match-string into json */
  export function parseMatch(match: string, opts?: matchOptions): object[]
  /** grab library internals */
  export function world(): object
  /** grab library metadata */
  export function model(): object
  /** grab exposed library methods */
  export function methods(): object
  /** which compute functions run automatically */
  export function hooks(): string[]
  /** log our decision-making for debugging */
  export function verbose(toLog?: boolean | string): any
  /** current semver version of the library */
  export const version: string
  /** connect new tags to tagset graph */
  export function addTags(tags: object): any
  /** add new words to internal lexicon */
  export function addWords(words: Lexicon): any
  /** turn a list of words into a searchable graph */
  export function buildTrie(words: string[]): object
  /** compile a set of match objects to a more optimized form */
  export function buildNet(matches: Match[]): Net
  /** add words to the autoFill dictionary */
  export function typeahead(words: Lexicon): any
  /**
   * export internal methods for plugins.
   * `Methods` is the shape of the plugin's `methods` object; it defaults to
   * `object` so the interface can also be used without a type argument.
   */
  export interface TypedPlugin<Methods = object> extends Plugin { methods: Methods }
}

export default nlp
// narrow a view down to its n-th match, when a number is given
export const getNth = (doc, n) => (typeof n === 'number' ? doc.eq(n) : doc)

// get the root form of a noun, as written by the 'root' compute step
const getRoot = function (m) {
  m.compute('root')
  return m.text('root')
}

// adds a `.nouns()` method to the given View class
const api = function (View) {
  class Nouns extends View {
    constructor(document, pointer, groups) {
      super(document, pointer, groups)
      this.viewType = 'Nouns'
    }
    // list the { singular, plural } forms of each noun
    conjugate(n) {
      const methods = this.methods.two.transform.noun
      return getNth(this, n).map(m => {
        let str = m.text()
        // uncountable nouns never inflect - test this first, so that one
        // which also carries #PluralNoun is not run through fromPlural()
        if (m.has('#Uncountable')) {
          return {
            singular: str,
            plural: str,
          }
        }
        if (m.has('#PluralNoun')) {
          return {
            plural: str,
            singular: methods.fromPlural(str)
          }
        }
        return {
          singular: str,
          plural: methods.toPlural(str)
        }
      }, [])
    }
    // keep only the plural nouns
    isPlural(n) {
      return getNth(this, n).if('#PluralNoun')
    }
    // replace each singular noun with its plural form
    toPlural(n) {
      const methods = this.methods.two.transform.noun
      return getNth(this, n).if('#Singular').map(m => {
        let plural = methods.toPlural(getRoot(m))
        return m.replaceWith(plural)
      })
    }
    // replace each plural noun with its singular form
    toSingular(n) {
      const methods = this.methods.two.transform.noun
      return getNth(this, n).if('#PluralNoun').map(m => {
        let singular = methods.fromPlural(getRoot(m))
        return m.replaceWith(singular)
      })
    }
  }

  // select every #Noun match (or only the n-th), wrapped as a Nouns view
  View.prototype.nouns = function (n) {
    let m = this.match('#Noun')
    m = getNth(m, n)
    return new Nouns(this.document, m.pointer)
  }
}
export default api
'Adjective', 'Value', 'AtMention'], 68 | }, 69 | Email: { 70 | not: ['HashTag', 'Verb', 'Adjective', 'Value', 'AtMention'], 71 | }, 72 | Acronym: { 73 | not: ['PluralNoun', 'RomanNumeral'], 74 | }, 75 | Negative: { 76 | not: ['Noun', 'Adjective', 'Value'], 77 | }, 78 | Condition: { 79 | not: ['Verb', 'Adjective', 'Noun', 'Value'], 80 | }, 81 | } 82 | -------------------------------------------------------------------------------- /scripts/pack.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-console */ 2 | import fs from 'fs' 3 | import { pack } from 'efrt' 4 | import { learn, compress } from 'suffix-thumb' 5 | import lexicon from '../data/lexicon/index.js' 6 | import models from '../data/models/index.js' 7 | // import switches from '../lib/switches/index.js' 8 | // import senses from '../lib/senses/index.js' 9 | 10 | const steps = [ 11 | { 12 | label: 'lexicon', 13 | path: './src/01-one/lexicon/model/_data.js', 14 | compress: function () { 15 | let packed = {} 16 | //turn them into a series of flat-arrays 17 | Object.keys(lexicon).forEach(word => { 18 | let tags = lexicon[word] 19 | if (typeof tags === 'string') { 20 | tags = [tags] 21 | } 22 | tags.forEach(tag => { 23 | packed[tag] = packed[tag] || [] 24 | packed[tag].push(word) 25 | }) 26 | }) 27 | //pack each array into a tiny string 28 | Object.keys(packed).forEach(tag => { 29 | packed[tag] = pack(packed[tag]) 30 | }) 31 | return packed 32 | }, 33 | }, 34 | { 35 | label: 'models', 36 | path: './src/01-one/lexicon/methods/_data.js', 37 | compress: function () { 38 | let packed = {} 39 | Object.keys(models).forEach(k => { 40 | packed[k] = {} 41 | Object.keys(models[k]).forEach(form => { 42 | let pairs = models[k][form] 43 | console.log(k, form) 44 | packed[k][form] = learn(pairs) 45 | packed[k][form] = compress(packed[k][form]) 46 | }) 47 | }) 48 | return packed 49 | }, 50 | } 51 | ] 52 | 53 | // run through all our steps 54 | steps.forEach(obj => { 55 
// hand-written lexicon entries: word -> list of tags
const misc = {
  // copulas (incomplete)
  es: ['Copula', 'PresentTense'],
  est: ['Copula', 'PresentTense'],
  suis: ['Copula', 'PresentTense'],
  sommes: ['Copula', 'PresentTense'],
  etes: ['Copula', 'PresentTense'],
  sont: ['Copula', 'PresentTense'],

  ete: ['Copula', 'PastTense'],
  etais: ['Copula', 'PastTense'],
  etions: ['Copula', 'PastTense'],

  serons: ['Copula', 'FutureTense'],
  seront: ['Copula', 'FutureTense'],
  serai: ['Copula', 'FutureTense'],

  cent: ['Multiple', 'Cardinal'],
  mille: ['Multiple', 'Cardinal'],
  million: ['Multiple', 'Cardinal'],
  milliard: ['Multiple', 'Cardinal'],
  quadrillion: ['Multiple', 'Cardinal'],
  centième: ['Multiple', 'Ordinal'],
  millième: ['Multiple', 'Ordinal'],
  millionième: ['Multiple', 'Ordinal'],
  milliardième: ['Multiple', 'Ordinal'],
  billionième: ['Multiple', 'Ordinal'],
  trillionième: ['Multiple', 'Ordinal'],
  // plural numbers
  septs: ['TextValue', 'Cardinal'],

  cents: ['Multiple', 'Cardinal'],
  milles: ['Multiple', 'Cardinal'],
  millions: ['Multiple', 'Cardinal'],
  milliards: ['Multiple', 'Cardinal'],

  êtes: ['Copula', 'PresentTense'],
  // imperfect - tagged the same as its unaccented spelling 'etions' above
  étions: ['Copula', 'PastTense'],
  // future - tagged the same as 'serons' / 'seront' / 'serai' above
  serez: ['Copula', 'FutureTense'],
  été: ['Copula'],
  fus: ['Copula', 'PastTense'],
  fut: ['Copula', 'PastTense'],
  fûmes: ['Copula', 'PastTense'],
  fûtes: ['Copula', 'PastTense'],
  furent: ['Copula', 'PastTense'],
  fusse: ['Copula', 'PastTense'],
  fusses: ['Copula', 'PastTense'],
  fût: ['Copula', 'PastTense'],
  fussions: ['Copula', 'PastTense'],
  fussiez: ['Copula', 'PastTense'],
  fussent: ['Copula', 'PastTense'],
  serais: ['Copula', 'PresentTense'],
  serait: ['Copula', 'PresentTense'],
  serions: ['Copula', 'PresentTense'],
  seriez: ['Copula', 'PresentTense'],
  seraient: ['Copula', 'PresentTense'],
  sois: ['Copula', 'PresentTense'],
  soyons: ['Copula', 'PresentTense'],
  soyez: ['Copula', 'PresentTense'],
  être: ['Copula', 'PresentTense'],
}

export default misc
// grammatical-person tags, in the order they are looked-up
const personForms = [
  'FirstPerson',
  'SecondPerson',
  'ThirdPerson',
  'FirstPersonPlural',
  'SecondPersonPlural',
  'ThirdPersonPlural',
]

// first person-tag found on the term, or undefined
const verbForm = function (term) {
  return personForms.find(tag => term.tags.has(tag))
}

// plural nouns -> singular form (pronouns are left alone)
const nounRoot = function (term, str, transform) {
  if (term.tags.has('Pronoun') || !term.tags.has('PluralNoun')) {
    return
  }
  term.root = transform.noun.fromPlural(str)
}

// adjectives -> singular masculine form
const adjectiveRoot = function (term, str, transform) {
  let isPlural = term.tags.has('PluralAdjective')
  let isFemale = term.tags.has('FemaleAdjective')
  if (isPlural && isFemale) {
    term.root = transform.adjective.fromFemalePlural(str)
  } else if (isFemale) {
    term.root = transform.adjective.fromFemale(str)
  } else if (isPlural) {
    term.root = transform.adjective.fromPlural(str)
  }
}

// verbs -> infinitive form
const verbRoot = function (term, str, transform) {
  let form = verbForm(term)
  // each matching tense overwrites the previous result, so the order matters
  if (term.tags.has('PresentTense')) {
    term.root = transform.verb.fromPresentTense(str, form)
  }
  if (term.tags.has('FutureTense')) {
    term.root = transform.verb.fromFutureTense(str, form)
  }
  if (term.tags.has('Passive')) {
    term.root = transform.verb.fromPassive(str, form)
  } else if (term.tags.has('PastTense')) {
    term.root = transform.verb.fromPastParticiple(str, form)
  }
  // fromImperfectTense, fromPastParticiple
}

// write a `root` property onto every inflected noun, adjective and verb in the view.
// terms that need no change are left without a `root` property.
const root = function (view) {
  const transform = view.world.methods.two.transform
  view.docs.forEach(terms => {
    terms.forEach(term => {
      let str = term.implicit || term.normal || term.text
      if (term.tags.has('Noun')) {
        nounRoot(term, str, transform)
      }
      if (term.tags.has('Adjective')) {
        adjectiveRoot(term, str, transform)
      }
      if (term.tags.has('Verb')) {
        verbRoot(term, str, transform)
      }
    })
  })
}
export default root
export type Document = Term[][]

export type Pointer = [n?: number, start?: number, end?: number, startId?: string, endId?: string]

export type outMethods = 'text' | 'normal' | 'offset' | 'terms' | 'topk' | 'json' | 'tags' | 'array' | 'debug'

export type Groups = object

export interface Term {
  text: string,
  pre: string,
  post: string,
  normal: string,

  // in /two
  // tag names are plain strings; `Set` needs its type argument to compile
  tags?: Set<string>,
  index?: [n?: number, start?: number],
  id?: string,
  chunk?: string,
  dirty?: boolean

  // other things you may find...
  syllables?: string[],
}

// possible values to .json()
export interface JsonProps {
  /** a perfect copy of the input text */
  text?: boolean
  /** normalized whitespace, case, unicode, punctuation */
  normal?: boolean
  /** lowercase, trimmed, contractions expanded. */
  reduced?: boolean
  /** cleanup whitespace */
  trim?: boolean
  /** character-position where this begins */
  offset?: boolean
  /** frequency of this match in the document */
  count?: boolean
  /** remove duplicate results */
  unique?: boolean
  /** starting term # in document */
  index?: boolean
  /** options for each term */
  terms?: {
    text?: boolean
    normal?: boolean
    clean?: boolean
    implicit?: boolean
    tags?: boolean
    whitespace?: boolean
    id?: boolean
    offset?: boolean
    bestTag?: boolean
  }
}

// a key-value object of words, terms
export interface Lexicon {
  [key: string]: string
}

export interface Plugin {
  methods?: object,
  model?: object,
  compute?: object,
  hooks?: string[],
  tags?: object,
  words?: object,
  lib?: () => object,
  api?: (fn: (view: any) => {}) => void, //should be View
  mutate?: (fn: (world: object) => {}) => void,
}

export interface matchOptions {
  fuzzy?: number,
  caseSensitive?: boolean,
}

export interface Match {
  match: string,
  tag?: string | string[],
  unTag?: string | string[],
  group?: string | number,
  reason?: string,
}

export interface Net {
  hooks: object,
  always?: any,
  isNet: boolean
}
entity, 94 | }, 95 | Demonym: { 96 | is: 'Noun', 97 | also: ['ProperNoun'], 98 | not: entity, 99 | }, 100 | Possessive: { 101 | is: 'Noun', 102 | }, 103 | // german genders 104 | MaleNoun: { 105 | is: 'Noun', 106 | not: ['FemaleNoun'], 107 | }, 108 | FemaleNoun: { 109 | is: 'Noun', 110 | not: ['MaleNoun'], 111 | }, 112 | } 113 | -------------------------------------------------------------------------------- /data/lexicon/misc/adverbs.js: -------------------------------------------------------------------------------- 1 | // all '-ment' words are tagged by suffix 2 | export default [ 3 | 'pas', 4 | // 'plus', 5 | 'ainsi', 6 | 'lors', 7 | 'alors', 8 | 'aussi', 9 | 'donc', 10 | 'tres', 11 | 'très', 12 | 'deja', 13 | 'encore', 14 | // 'tout', 15 | 'bien', 16 | // 'moins', 17 | 'non', 18 | // 'hier', 19 | "jusqu'", 20 | 'meme', 21 | // 'peu', 22 | 'toujours', 23 | 'cependant', 24 | 'ailleurs', 25 | 'toutefois', 26 | // 'ici', 27 | 'environ', 28 | 'quant', 29 | 'que', 30 | 'tandis', 31 | 'beaucoup', 32 | 'outre', 33 | 'qu', 34 | 'ensuite', 35 | 'tant', 36 | 'jamais', 37 | 'enfin', 38 | 'tard', 39 | 'desormais', 40 | // 'maintenant', 41 | 'trop', 42 | 'autant', 43 | 'loin', 44 | 'pourtant', 45 | 'surtout', 46 | 'autour', 47 | 'auparavant', 48 | 'neanmoins', 49 | 'assez', 50 | 'tot', 51 | 'mieux', 52 | 'souvent', 53 | 'plutot', 54 | 'demain', 55 | 'pres', 56 | 'longtemps', 57 | 'presque', 58 | 'peut-etre', 59 | // 'mal', 60 | 'avant', 61 | 'partout', 62 | 'davantage', 63 | 'juste', 64 | 'vite', 65 | 'puis', 66 | 'parfois', 67 | 'guere', 68 | 'au dela', 69 | 'oui', 70 | 'au dessus', 71 | 'ores', 72 | // 'dehors', 73 | 'si', 74 | 'ci', 75 | 'bientot', 76 | // 'ensemble', 77 | 'apres', 78 | 'depuis', 79 | 'quand', 80 | 'quelque', 81 | 'aussitôt', 82 | 'quasi', 83 | // 'fort', 84 | 'vis a vis', 85 | 'dessous', 86 | 'voire', 87 | 'certes', 88 | 'jusque la', 89 | 'ci dessus', 90 | // 'matin', 91 | 'ci dessous', 92 | 'contre', 93 | 'autrefois', 94 | 'combien', 95 | 'comme', 96 | 
// Second-pass acronym detection for the pre-tagger.
//
// NOTE: period-style acronyms ('N.D.A', 'c.e.o') are NOT recognised here.
// Every positive check in isNoPeriodAcronym sits behind an all-upper-case
// gate, which rejects any string containing '.' or a lower-case letter, so the
// former period/lower-case checks could never fire and have been removed.

const oneLetterAcronym = /^[A-Z]('s|,)?$/
const isUpperCase = /^[A-Z-]+$/
const noPeriodAcronym = /[A-Z]{2,}('s|,)?$/

// single capital letters that are treated as ordinary words, not acronyms
const oneLetterWord = {
  I: true,
  A: true,
}

// own-property lookup that does not rely on the object's prototype chain
const hasOwn = function (obj, key) {
  return Object.prototype.hasOwnProperty.call(obj, key)
}

// just uppercase acronyms, no periods - 'UNOCHA'
// - term:  object with `text` (original casing) and `normal` (lexicon key)
// - model: object exposing the lexicon at `model.one.lexicon`
// returns true when the term looks like a short, unknown, all-caps acronym
const isNoPeriodAcronym = function (term, model) {
  let str = term.text
  // ensure it's all upper-case (hyphens are allowed)
  if (isUpperCase.test(str) === false) {
    return false
  }
  // long capitalized words are not usually acronyms either
  if (str.length > 5) {
    return false
  }
  // 'I' is not an acronym
  if (hasOwn(oneLetterWord, str)) {
    return false
  }
  // known-words, like 'PIZZA', are not acronyms
  if (hasOwn(model.one.lexicon, term.normal)) {
    return false
  }
  // like 'F', or like 'NDA'
  return oneLetterAcronym.test(str) || noPeriodAcronym.test(str)
}

// Tag terms[i] as an Acronym when it looks like one.
// returns true when a tag was applied, otherwise null
const isAcronym = function (terms, i, world) {
  let setTag = world.methods.one.setTag
  let term = terms[i]
  // these are not acronyms (or are already tagged as one)
  if (term.tags.has('RomanNumeral') || term.tags.has('Acronym')) {
    return null
  }
  // non-period ones are harder - existing tags are dropped before re-tagging
  if (isNoPeriodAcronym(term, world.model)) {
    term.tags.clear()
    setTag([term], ['Acronym', 'Noun'], world, false, '3-no-period-acronym')
    return true
  }
  // one-letter acronyms, including forms like "F's" or "F,"
  if (!hasOwn(oneLetterWord, term.text) && oneLetterAcronym.test(term.text)) {
    term.tags.clear()
    setTag([term], ['Acronym', 'Noun'], world, false, '3-one-letter-acronym')
    return true
  }
  // a very-short organization: existing tags are kept, Acronym is added
  if (term.tags.has('Organization') && term.text.length <= 3) {
    setTag([term], 'Acronym', world, false, '3-org-acronym')
    return true
  }
  // upper-case org, like UNESCO
  if (term.tags.has('Organization') && isUpperCase.test(term.text) && term.text.length <= 6) {
    setTag([term], 'Acronym', world, false, '3-titlecase-acronym')
    return true
  }
  return null
}
export default isAcronym
/^[-+]?[0-9]+(,[0-9]{3})*(\.[0-9]+)?[$\xA2-\xA5\u058F\u060B\u09F2\u09F3\u09FB\u0AF1\u0BF9\u0E3F\u17DB\u20A0-\u20BD\uA838\uFDFC\uFE69\uFF04\uFFE0\uFFE1\uFFE5\uFFE6]\+?$/, 36 | ['Money', 'Value'], 37 | '5.30£', 38 | ], 39 | //like 40 | [/^[-+]?[$£]?[0-9]([0-9,.])+(usd|eur|jpy|gbp|cad|aud|chf|cny|hkd|nzd|kr|rub)$/i, ['Money', 'Value'], '$400usd'], 41 | 42 | //numbers 43 | // 50 | -50 | 3.23 | 5,999.0 | 10+ 44 | [/^[-+]?[0-9]+(,[0-9]{3})*(\.[0-9]+)?\+?$/, ['Cardinal', 'NumericValue'], '5,999'], 45 | [/^[-+]?[0-9]+(,[0-9]{3})*(\.[0-9]+)?(e|er)$/, ['Ordinal', 'NumericValue'], '53rd'], 46 | // .73th 47 | [/^\.[0-9]+\+?$/, ['Cardinal', 'NumericValue'], '.73th'], 48 | //percent 49 | [/^[-+]?[0-9]+(,[0-9]{3})*(\.[0-9]+)?%\+?$/, ['Percent', 'Cardinal', 'NumericValue'], '-4%'], 50 | [/^\.[0-9]+%$/, ['Percent', 'Cardinal', 'NumericValue'], '.3%'], 51 | //fraction 52 | [/^[0-9]{1,4}\/[0-9]{1,4}(e|er)?s?$/, ['Fraction', 'NumericValue'], '2/3rds'], 53 | //range 54 | [/^[0-9.]{1,3}[a-z]{0,2}[-–—][0-9]{1,3}[a-z]{0,2}$/, ['Value', 'NumberRange'], '3-4'], 55 | //time-range 56 | [/^[0-9]{1,2}(:[0-9][0-9])?(am|pm)? 
// Builds the runtime lexicon: unpacks the compressed word-lists, then expands
// each entry with its derived forms (adjective agreement, noun plurals,
// verb conjugations). Derived forms never overwrite an existing entry.
import lexData from './_data.js'
import { unpack } from 'efrt'
import transform from '../methods/index.js'
import misc from './misc.js'

// conjugation-result keys -> grammatical-person tags
const tagMap = {
  first: 'FirstPerson',
  second: 'SecondPerson',
  third: 'ThirdPerson',
  firstPlural: 'FirstPersonPlural',
  secondPlural: 'SecondPersonPlural',
  thirdPlural: 'ThirdPersonPlural',
}

let words = {}

// register a derived form, unless the word is already known.
// missing forms (undefined / empty string) are skipped, so that a transform
// which cannot produce a form does not create a bogus 'undefined' entry.
const addWord = function (str, tags) {
  if (!str || words[str]) {
    return
  }
  words[str] = tags
}

// register every conjugated form in `forms`, tagged by person + tense
const addConjugations = function (forms, tense) {
  Object.keys(forms).forEach(k => {
    addWord(forms[k], [tagMap[k], tense])
  })
}

Object.keys(lexData).forEach(tag => {
  let wordsObj = unpack(lexData[tag])
  Object.keys(wordsObj).forEach(w => {
    // the listed word itself always takes the tag of its list
    words[w] = tag

    // expand
    if (tag === 'MaleAdjective') {
      let res = transform.adjective.conjugate(w)
      addWord(res.female, 'FemaleAdjective')
      addWord(res.plural, 'MaleAdjective')
      addWord(res.femalePlural, 'FemaleAdjective')
    }
    if (tag === 'Cardinal') {
      words[w] = ['TextValue', 'Cardinal']
    }
    if (tag === 'Noun' || tag === 'MaleNoun' || tag === 'FemaleNoun') {
      words[w] = [tag, 'Singular']
      // the plural is registered here for every noun list; a later, separate
      // MaleNoun step could never add anything and has been removed
      addWord(transform.noun.toPlural(w), ['Noun', 'Plural'])
    }
    if (tag === 'Ordinal') {
      words[w] = ['TextValue', 'Ordinal']
      // also accept the un-accented spelling
      addWord(w.replace(/è/, 'e'), ['TextValue', 'Ordinal'])
    }
    if (tag === 'Infinitive') {
      // do future-tense
      addConjugations(transform.verb.toFutureTense(w), 'FutureTense')
      // do present-tense
      addConjugations(transform.verb.toPresentTense(w), 'PresentTense')
      // do imperfect mood - only tagged as a generic Verb
      let imperfect = transform.verb.toImperfect(w)
      Object.keys(imperfect).forEach(k => addWord(imperfect[k], 'Verb'))
      // past-participle
      addWord(transform.verb.toPastParticiple(w), 'PastParticiple')
    }
  })
})

// hand-written entries in misc take precedence over generated ones
let lexicon = Object.assign({}, words, misc)
export default lexicon
vingtième'], 36 | [90, 'quatre vingt dix huit', 'quatre vingt dix huitième'], 37 | 38 | [100, 'cent', 'centième'], 39 | [1000, 'mille', 'millième'], 40 | [1000000, 'million', 'millionième'],//million 1000,000 41 | [1000000000, 'milliard', 'milliardième'],//billion 1000,000,000 42 | // [1000000000000, 'mille milliards', 'mille milliardième'],//trillion 1000,000,000 43 | 44 | ] 45 | test('cardinal to ordinal:', function (t) { 46 | arr.forEach(function (a) { 47 | let [_, card, ord] = a 48 | let doc = nlp(card).numbers().toOrdinal() 49 | t.equal(doc.text(), ord, here + ' [toOrdinal] ' + card) 50 | }) 51 | t.end() 52 | }) 53 | test('ordinal -> cardinal:', function (t) { 54 | arr.forEach(function (a) { 55 | let [, card, ord] = a 56 | let doc = nlp(ord).numbers().toCardinal() 57 | t.equal(doc.text(), card, here + ' [toCardinal] ' + card) 58 | }) 59 | t.end() 60 | }) 61 | 62 | 63 | test('ordinal fmt:', function (t) { 64 | let list = [ 65 | // [1, 'première', '1er'],//'first' 66 | [2, 'deuxième', '2e'],//'second' 67 | [3, 'troisième', '3e'],//'third' 68 | [4, 'quatrième', '4e'],//'fourth' 69 | [5, 'cinquième', '5e'],//'fifth' 70 | [6, 'sixième', '6e'],//'sixth' 71 | [7, 'septième', '7e'],//'seventh' 72 | [8, 'huitième', '8e'],//'eighth' 73 | [9, 'neuvième', '9e'],//'ninth' 74 | [10, 'dixième', '10e'],//'tenth' 75 | ] 76 | list.forEach(function (a) { 77 | let [_, str, want] = a 78 | let m = nlp(str).numbers().toNumber() 79 | t.equal(m.text(), want, here + str) 80 | }) 81 | t.end() 82 | }) 83 | -------------------------------------------------------------------------------- /tests/numbers/number-misc.test.js: -------------------------------------------------------------------------------- 1 | import test from 'tape' 2 | import nlp from '../_lib.js' 3 | let here = '[number-misc] ' 4 | 5 | 6 | test('num equals', function (t) { 7 | let arr = [ 8 | ['un cent', 'cent'], 9 | ['trois cents', 'trois cent'], 10 | ['un million', 'million'], 11 | ['3 cent', 'trois cent'], 12 | 
['cinquante', 'cinquantième'], 13 | ['sept', 'septième'], 14 | ['dix huit', 'dix huitième'], 15 | ['moins dix huitième', '-18e'], 16 | ['moins dix huit', '-18'], 17 | ['moins deux centième', '-200'], 18 | ['quatorze cent', 'quatorze centième'] 19 | ] 20 | arr.forEach(a => { 21 | let [left, right] = a 22 | left = nlp(left).numbers().get()[0] 23 | right = nlp(right).numbers().get()[0] 24 | t.equal(left, right, here + a.join(' == ')) 25 | }) 26 | t.end() 27 | }) 28 | 29 | test('prefix/suffix:', function (t) { 30 | let doc = nlp('$7,938').numbers().add(1) 31 | t.equal(doc.text(), '$7939', here + 'add money') 32 | 33 | doc = nlp('7,938kg').numbers().minus(1) 34 | t.equal(doc.text(), '7937kg', here + 'minus w/ unit') 35 | 36 | doc = nlp('938.4cm').numbers().minus(1) 37 | t.equal(doc.text(), '937.4cm', here + 'minus w/ decimal') 38 | 39 | doc = nlp('33e').numbers().add(1) 40 | t.equal(doc.text(), '34e', here + 'add ordinal') 41 | t.end() 42 | }) 43 | 44 | // test('units-basic:', function (t) { 45 | // let arr = [ 46 | // // ['33km', 'km'], 47 | // ['33 km', 'km'], 48 | // ['40,000 mètres', 'mètres'], 49 | // ['1 pouce', 'pouce'], 50 | // ['2 pouces', 'pouces'], 51 | // ['seven hundred litres', 'litres'], 52 | // ['one litre', 'litre'], 53 | // ['0.4 mètre', 'meter'], 54 | // // ['3 km2', 'km2'], 55 | // ['3 km²', 'km²'], 56 | // // ['44 °c', '°c'], 57 | // ] 58 | // arr.forEach(a => { 59 | // let m = nlp(a[0]).numbers().units() 60 | // t.equal(m.out('normal'), a[1], here + a[0]) 61 | // }) 62 | // t.end() 63 | // }) 64 | 65 | 66 | test('plus:', function (t) { 67 | let doc = nlp(`j'ai quatre vingt deux pommes`) 68 | doc.numbers().add(2) 69 | t.equal(doc.text(), `j'ai quatre vingt quatre pommes`, here + 'plus-2') 70 | 71 | doc = nlp(`j'ai moins quarante pommes`) 72 | doc.numbers().add(50) 73 | t.equal(doc.text(), `j'ai dix pommes`, here + 'plus-50') 74 | t.end() 75 | }) 76 | 77 | test('minus:', function (t) { 78 | let doc = nlp(`j'ai quarante pommes`) 79 | 
// Guess the grammatical gender of a French noun from (in order of priority):
// an existing gender tag, a nearby article, a following gendered adjective,
// its suffix, and finally a weak 'ends in -e' fallback.

// articles that reveal the gender of the noun they precede
const masc = new Set(['le', 'un', 'du'])
const femme = new Set(['la', 'une'])

const femaleEnds = ['anse', 'ette', 'esse', 'ance', 'eine', 'ure', 'ion']
// (de-duplicated - membership is unchanged)
const maleEnds = [
  'age', 'isme', 'eau', 'ment', 'in', 'ou', 'et', 'ege', 'eme', 'ome', 'aume', 'an', 'ent', 'ai', 'out', 'eu', 'ut', 'is', 'il', 'ex',
  'and', 'ant', 'int', 'om', 'ond', 'ont', 'au', 'aud', 'aut', 'o', 'os', 'ot', 'ais', 'ait', 'es', 'oux', 'i', 'it', 'y', 'at', 'as', 'ois', 'oit', 'u', 'us',
  'er', 'cé', 'ème', 'est', 'al', 'el', 'ol', 'eul', 'all', 'if', 'ef', 'ac', 'ic', 'oc', 'uc', 'am', 'um', 'en', 'air',
  'erf', 'ert', 'ar', 'arc', 'ars', 'art', 'our', 'ours', 'or', 'ord', 'ors', 'ort', 'ir', 'oir', 'eur', 'ail', 'eil', 'euil', 'ueil', 'ing',
]

// guess from the word-ending. female endings are tested first.
const suffixGuess = function (term) {
  let str = term.normal || ''
  // ignore one trailing (plural) 's'
  str = str.replace(/s$/, '')
  if (femaleEnds.some(suff => str.endsWith(suff))) {
    return 'FemaleNoun'
  }
  if (maleEnds.some(suff => str.endsWith(suff))) {
    return 'MaleNoun'
  }
  return null
}

// last resort: a word ending in -e / -es is guessed as female
const fallback = function (term) {
  let str = term.normal || ''
  if (str.endsWith('e') || str.endsWith('es')) {
    return 'FemaleNoun'
  }
  return null //-?
}

// look for a gendered article in the two terms before terms[i]
const lookLeft = function (terms, i) {
  for (let n = 1; n < 3; n += 1) {
    let term = terms[i - n]
    // stop at the start of the list
    if (!term) {
      return null
    }
    if (masc.has(term.normal)) {
      return 'MaleNoun'
    }
    if (femme.has(term.normal)) {
      return 'FemaleNoun'
    }
  }
  return null
}

// look for a gendered adjective immediately after terms[i]
const lookRight = function (terms, i) {
  let term = terms[i + 1]
  if (!term) {
    return null
  }
  if (term.tags.has('MaleAdjective')) {
    return 'MaleNoun'
  }
  if (term.tags.has('FemaleAdjective')) {
    return 'FemaleNoun'
  }
  return null
}

// returns 'MaleNoun', 'FemaleNoun', or null when terms[i] is missing,
// is not a Noun, or no guess can be made
const guessGender = function (terms, i) {
  let term = terms[i]
  if (!term || !term.tags || !term.tags.has('Noun')) {
    return null
  }
  let { tags } = term
  // already decided
  if (tags.has('MaleNoun')) {
    return 'MaleNoun'
  }
  if (tags.has('FemaleNoun')) {
    return 'FemaleNoun'
  }
  return lookLeft(terms, i) || lookRight(terms, i) || suffixGuess(term) || fallback(term)
}
export default guessGender
20 | [`03-02-1999`, 'march 2nd 1999'], 21 | [`03/02`, 'march 2'], 22 | [`2015.08.13`, 'aug 13 2015'], 23 | 24 | // named-dates 25 | [`today`, '2020-01-21'], 26 | [`now`, 'right now'], 27 | [`q1`, 'jan 1'], 28 | [`tomorrow`, '2020-01-22'], 29 | 30 | // time 31 | [`2pm`, '2020-01-21T14:00:00.000-08:00'], 32 | [`2:12pm`, '2020-01-21T14:12:00.000-08:00'], 33 | [`2pm eastern time`, '2020-01-21T14:00:00.000-05:00'], 34 | [`2:12 in the evening`, '2020-01-21T14:12:00.000-08:00'], 35 | [`02:12:00am`, '2020-01-21T02:12:00.000-08:00'], 36 | [`2 oclock am`, '2020-01-21T02:00:00.000-08:00'], 37 | [`noon`, 'today at 12pm'], 38 | [`at night`, 'today at 8:00pm'], 39 | [`in the morning`, 'tomorrow at 8:00pm'], 40 | [`tomorrow evening`, 'Jan 22 6pm'], 41 | [`aug-20`, '20-aug'], 42 | [`in a few years`, `in 3 years`], 43 | [`in a couple years`, `in 2 years`], 44 | [`2 weeks back`, `2 weeks ago`], 45 | [`last q1`, `q1 2019`], 46 | [`last q2`, `q2 2019`], 47 | [`last q3`, `q3 2019`], 48 | [`last q4`, `q4 2019`], 49 | [`this q1`, `q1 2020`], 50 | [`this q2`, `q2 2020`], 51 | [`this q3`, `q3 2020`], 52 | [`this q4`, `q4 2020`], 53 | [`next q1`, `q1 2021`], 54 | [`next q2`, `q2 2021`], 55 | [`next q3`, `q3 2021`], 56 | [`next q4`, `q4 2021`], 57 | [`tuesday at 3`, `tuesday 3:00pm`], 58 | [`tuesday at 4:00`, `tuesday 4:00pm`], 59 | [`5:30`, `today at 5:30pm`], 60 | [`tuesday at 3am`, `tuesday 3:00am`], 61 | [`5 oclock`, `today at 5:00pm`], 62 | [`5 oclock am`, `today at 5:00am`], 63 | [`10 oclock`, `today at 10:00am`], 64 | [`11:30`, `today at 11:30am`], 65 | [`11:30pm`, `today at 11:30pm`], 66 | [`tuesday at 1`, `tuesday at 1pm`], 67 | ['this fri, monday', 'fri jan 24 and mon jan 27'], 68 | ['next friday, this monday', 'fri jan 31 and mon jan 27'], 69 | ] 70 | 71 | test('date-variety', function (t) { 72 | arr.forEach((a) => { 73 | let left = nlp(a[0]).dates(context).json()[0] || {} 74 | let right = nlp(a[1]).dates(context).json()[0] || {} 75 | left.date = left.date || {} 76 | right.date = 
right.date || {} 77 | t.equal(left.date.start, right.date.start, a[0]) 78 | }) 79 | t.end() 80 | }) 81 | -------------------------------------------------------------------------------- /src/02-two/tagset/tags/verbs.js: -------------------------------------------------------------------------------- 1 | export default { 2 | Verb: { 3 | not: ['Noun', 'Adjective', 'Adverb', 'Value', 'Expression'], 4 | }, 5 | PresentTense: { 6 | is: 'Verb', 7 | not: ['PastTense'], 8 | }, 9 | Infinitive: { 10 | is: 'PresentTense', 11 | not: ['Gerund'], 12 | }, 13 | Imperative: { 14 | is: 'Infinitive', 15 | }, 16 | Gerund: { 17 | is: 'PresentTense', 18 | not: ['Copula'], 19 | }, 20 | PastTense: { 21 | is: 'Verb', 22 | not: ['PresentTense', 'Gerund'], 23 | }, 24 | Copula: { 25 | is: 'Verb', 26 | }, 27 | Modal: { 28 | is: 'Verb', 29 | not: ['Infinitive'], 30 | }, 31 | PerfectTense: { 32 | is: 'Verb', 33 | not: ['Gerund'], 34 | }, 35 | Pluperfect: { 36 | is: 'Verb', 37 | }, 38 | Participle: { 39 | is: 'PastTense', 40 | }, 41 | PhrasalVerb: { 42 | is: 'Verb', 43 | }, 44 | Passive: { 45 | is: 'PastTense', 46 | }, 47 | Particle: { 48 | is: 'PhrasalVerb', 49 | not: ['PastTense', 'PresentTense', 'Copula', 'Gerund'], 50 | }, 51 | Auxiliary: { 52 | is: 'Verb', 53 | not: ['PastTense', 'PresentTense', 'Gerund', 'Conjunction'], 54 | }, 55 | 56 | // french verb forms 57 | PresentParticiple: { 58 | is: 'PresentTense', 59 | not: ['PastTense', 'FutureTense'], 60 | }, 61 | PastParticiple: { 62 | is: 'PastTense', 63 | not: ['PresentTense', 'FutureTense'], 64 | }, 65 | // [only formal] parlai, parlâmes 66 | PastSimple: { 67 | is: 'PastTense', 68 | not: ['PresentTense', 'FutureTense'], 69 | }, 70 | ConditionalVerb: { 71 | is: 'Verb', 72 | }, 73 | FutureTense: { 74 | is: 'Verb', 75 | not: ['PresentTense', 'PastTense', 'Gerund'], 76 | }, 77 | 78 | // 79 | FirstPerson: { 80 | is: 'Verb', 81 | not: ['SecondPerson', 'ThirdPerson', 'FirstPersonPlural', 'SecondPersonPlural', 'ThirdPersonPlural'] 82 | }, 83 | 
// Counts word frequencies per part-of-speech over the tagged corpus files,
// then writes one frequency-sorted word list per tag into the working directory.
import { forEachSync } from './_giga.js'
import doSentences from './french.js'
import fs from 'fs'

// corpus file ids: '0001' .. '0010'
let ids = Array.from({ length: 10 }, (_, n) => String(n + 1).padStart(4, '0'))
// ids = ['0004']

// corpus part-of-speech codes -> our tags
let tagMap = {
  'ABR': 'Abbreviation',//abbreviation
  'ADJ': 'Adjective',//adjective
  'ADV': 'Adverb',//adverb
  'DET:ART': 'Determiner',//article
  'DET:POS': 'Pronoun',//possessive pronoun (ma, ta, ...)
  'INT': 'Interjection',//interjection
  'KON': 'Conjunction',//conjunction
  'NAM': 'ProperNoun',//proper name
  'NOM': 'Noun',//noun
  'NUM': 'Value',//numeral
  'PRO': 'Pronoun',//pronoun
  'PRO:DEM': 'Pronoun',//demonstrative pronoun
  'PRO:IND': 'Pronoun',//indefinite pronoun
  'PRO:PER': 'Pronoun',//personal pronoun
  'PRO:POS': 'Pronoun',//possessive pronoun (mien, tien, ...)
  'PRO:REL': 'Pronoun',//relative pronoun
  'PRP': 'Preposition',//preposition
  'PRP:det': 'Preposition',//preposition plus article (au,du,aux,des)
  // 'PUN':'',//punctuation
  // 'PUN:cit':'',//punctuation citation
  // 'SENT':'',//sentence tag
  // 'SYM':'',//symbol
  'VER:cond': 'Verb',//verb conditional
  'VER:futu': 'Verb',//verb futur
  'VER:impe': 'Verb',//verb imperative
  'VER:impf': 'Verb',//verb imperfect
  'VER:infi': 'Verb',//verb infinitive
  'VER:pper': 'Verb',//verb past participle
  'VER:ppre': 'Verb',//verb present participle
  'VER:pres': 'Verb',//verb present
  'VER:simp': 'Verb',//verb simple past
  'VER:subi': 'Verb',//verb subjunctive imperfect
  'VER:subp': 'Verb',//verb subjunctive present
}

// word -> count, only for the tags we export
let byTag = {
  Verb: {},
  Noun: {},
  Adjective: {},
  Adverb: {},
}

// tally every french term of one sentence-pair under its mapped tag
const doBoth = function (both) {
  both.fr.forEach(term => {
    let tag = tagMap[term['$'].pos]
    let str = term['$text'].toLowerCase()
    // skip unmapped codes, and tags we are not collecting
    if (!tag || !byTag[tag]) {
      return
    }
    let counts = byTag[tag]
    counts[str] = (counts[str] || 0) + 1
  })
}

// one file at a time - a failing file is logged, and the rest still run
await forEachSync(ids, async id => {
  try {
    console.log(`\ndoing ${id}:\n`)
    await doSentences(id, doBoth)
  } catch (e) {
    console.log(e)
  }
})

// write `./<tag>.js`: words seen more than `max` times, most-frequent first.
// (despite its name, `max` is the count a word must exceed to be kept)
const doTag = function (tag, max = 6) {
  let all = Object.entries(byTag[tag])
    .filter(pair => pair[1] > max)
    .sort((a, b) => b[1] - a[1])
    .map(pair => pair[0])
  fs.writeFileSync(`./${tag}.js`, 'export default ' + JSON.stringify(all, null, 2))
  return all
}
doTag('Adverb')
doTag('Verb')
doTag('Noun')
doTag('Adjective')
// console.dir(byTag, { depth: 5 })
/plugins/dates/tests/backburner/to-iso.ignore.js: -------------------------------------------------------------------------------- 1 | import test from 'tape' 2 | import nlp from './_lib.js' 3 | 4 | const context = { 5 | today: '2019-02-02T03:40:00.000Z', 6 | timezone: 'UTC', 7 | } 8 | 9 | let arr = [ 10 | ['june 5th 1999', '1999-06-05T00:00:00.000Z'], 11 | ['june 5th 1999', '1999-06-05T00:00:00.000Z'], 12 | ['january 1st 1644', '1644-01-01T00:00:00.000Z'], 13 | ['jan 1st 1644', '1644-01-01T00:00:00.000Z'], 14 | ['June 4th 1993', '1993-06-04T00:00:00.000Z'], 15 | ['March 1st 1987', '1987-03-01T00:00:00.000Z'], 16 | ['June 22nd 2014', '2014-06-22T00:00:00.000Z'], 17 | ['may 22nd 2014', '2014-05-22T00:00:00.000Z'], 18 | ['sep 22nd 2014', '2014-09-22T00:00:00.000Z'], 19 | ['apr 22nd 2014', '2014-04-22T00:00:00.000Z'], 20 | ['June 22nd 1997', '1997-06-22T00:00:00.000Z'], 21 | ['january 5th 1998', '1998-01-05T00:00:00.000Z'], 22 | ['3rd of March 1969', '1969-03-03T00:00:00.000Z'], 23 | ['2nd of April 1929', '1929-04-02T00:00:00.000Z'], 24 | ['2nd of jul 1929', '1929-07-02T00:00:00.000Z'], 25 | ['March 1969', '1969-03-01T00:00:00.000Z'], 26 | ['jan 1921', '1921-01-01T00:00:00.000Z'], 27 | ['March 18th', '2019-03-18T00:00:00.000Z'], 28 | ['August 28th', '2019-08-28T00:00:00.000Z'], 29 | ['18th of March', '2019-03-18T00:00:00.000Z'], 30 | ['27th of March', '2019-03-27T00:00:00.000Z'], 31 | ['february 10th', '2019-02-10T00:00:00.000Z'], 32 | ['february 28th', '2019-02-28T00:00:00.000Z'], 33 | ['first day of 2019', '2019-01-01T00:00:00.000Z'], 34 | ['last day of 2019', '2019-12-31T00:00:00.000Z'], 35 | ['7th hour of 2019', '2019-01-01T06:00:00.000Z'], 36 | ['7th day of 2019', '2019-01-07T00:00:00.000Z'], 37 | ['second quarter of 2019', '2019-04-01T00:00:00.000Z'], 38 | ['30th minute of 2019', '2019-01-01T00:30:00.000Z'], 39 | ['2019', '2019-01-01T00:00:00.000Z'], 40 | ['2028', '2028-01-01T00:00:00.000Z'], 41 | ['in 2028', '2028-01-01T00:00:00.000Z'], 42 | ['2nd month in 
// Parses a run of French number-words ('quatre vingt dix', 'deux mille')
// into a single numeric value.
import { toCardinal, toNumber } from './_data.js'

// own-property lookup that does not rely on the object's prototype chain
const hasOwn = function (obj, key) {
  return Object.prototype.hasOwnProperty.call(obj, key)
}

// words that can begin a multi-word number
const multiLeft = {
  dix: true,//dix huit
  soixante: true,//soixante dix
  quatre: true,//quatre vingt
  mille: true//mille milliards
}

// scale words: they multiply everything accumulated so far.
// singular and plural spellings are both listed.
const multiples = {
  // cent: 100,//hundred
  mille: 1000,//thousand
  milles: 1000,//thousand
  million: 1000000,//million
  millions: 1000000,//million
  milliard: 1000000000,//billion
  milliards: 1000000000//billion
}

// greedy scan for multi-word numbers, like 'quatre vingt'
// looks at up to three terms starting at i, and keeps the longest phrase
// found in toNumber.
// returns { skip, add }: how many extra terms were consumed, and their value
const scanAhead = function (terms, i) {
  let skip = 0
  let add = 0
  let words = []
  for (let index = 0; index < 3; index += 1) {
    if (!terms[i + index]) {
      break
    }
    let w = terms[i + index].normal || ''
    // compare ordinals by their cardinal form
    if (hasOwn(toCardinal, w)) {
      w = toCardinal[w]
    }
    words.push(w)
    let str = words.join(' ')
    if (hasOwn(toNumber, str)) {
      skip = index
      add = toNumber[str]
    }
  }
  return { skip, add }
}

// terms: array of term objects, each with a `tags` Set and a `normal` string
// returns the parsed number (0 when nothing could be parsed)
const parseNumbers = function (terms = []) {
  let sum = 0 // running total for the current scale-group
  let carry = 0 // small value waiting for a following 'cent' or scale word
  let minus = false
  let sums = [] // completed scale-groups
  for (let i = 0; i < terms.length; i += 1) {
    let { tags, normal } = terms[i]
    let w = normal || ''
    if (w === 'moins') {
      minus = true
      continue
    }
    // ... et-un
    if (w === 'et') {
      continue
    }
    // 'huitieme' - an ordinal with no known cardinal form is left as-is,
    // instead of becoming undefined
    if (tags.has('Ordinal') && hasOwn(toCardinal, w)) {
      w = toCardinal[w]
    }
    // add thousand, million
    if (hasOwn(multiples, w)) {
      sum += carry
      carry = 0
      // a bare 'mille' means one-thousand
      if (!sum) {
        sum = 1
      }
      sum *= multiples[w]
      sums.push(sum)
      sum = 0
      continue
    }
    // support 'quatre vingt dix', etc
    if (hasOwn(multiLeft, w)) {
      let { add, skip } = scanAhead(terms, i)
      if (skip > 0) {
        carry += add
        i += skip
        continue
      }
    }

    // 'cent'
    if (tags.has('Multiple')) {
      let mult = toNumber[w] || 1
      // a bare 'cent' means one-hundred
      if (carry === 0) {
        carry = 1
      }
      sum += mult * carry
      carry = 0
      continue
    }
    // 'trois'
    if (hasOwn(toNumber, w)) {
      carry += toNumber[w]
    } else {
      // numeric text, like '3'
      let n = Number(w)
      if (n) {
        carry += n
      } else {
        // console.log('missing', w) //TODO: fixme
      }
    }
  }
  // include any remaining
  if (carry !== 0) {
    sum += carry
  }
  sums.push(sum)
  sum = sums.reduce((h, n) => {
    return h + n
  }, 0)
  if (minus === true) {
    sum *= -1
  }
  return sum
}
export default parseNumbers
`node scripts/pack.js` 2 | //they are stored in compressed form 3 | import lex from './misc.js' 4 | 5 | import firstnames from './people/firstnames.js' 6 | import lastnames from './people/lastnames.js' 7 | import maleNames from './people/maleNames.js' 8 | import femaleNames from './people/femaleNames.js' 9 | import honorifics from './people/honorifics.js' 10 | import people from './people/people.js' 11 | 12 | import countries from './places/countries.js' 13 | import regions from './places/regions.js' 14 | import places from './places/places.js' 15 | import cities from './places/cities.js' 16 | 17 | import cardinals from './numbers/cardinals.js' 18 | import ordinals from './numbers/ordinals.js' 19 | import units from './numbers/units.js' 20 | 21 | import infinitives from './verbs/infinitives.js' 22 | 23 | import masculine from './nouns/masculine.js' 24 | import feminine from './nouns/feminine.js' 25 | import sportsTeams from './nouns/sportsTeams.js' 26 | import organizations from './nouns/organizations.js' 27 | import possessives from './nouns/possessives.js' 28 | import pronouns from './nouns/pronouns.js' 29 | import uncountables from './nouns/uncountables.js' 30 | import nouns from './nouns/nouns.js' 31 | 32 | import masc from './adjectives/masc.js' 33 | 34 | import dates from './dates/dates.js' 35 | import months from './dates/months.js' 36 | import weekdays from './dates/weekdays.js' 37 | 38 | import adverbs from './misc/adverbs.js' 39 | import conjunctions from './misc/conjunctions.js' 40 | import currencies from './misc/currencies.js' 41 | import expressions from './misc/expressions.js' 42 | import determiners from './misc/determiners.js' 43 | import prepositions from './misc/prepositions.js' 44 | //add-in the generic, flat word-lists 45 | const data = [ 46 | [firstnames, 'FirstName'], 47 | [lastnames, 'LastName'], 48 | [maleNames, 'MaleName'], 49 | [femaleNames, 'FemaleName'], 50 | [honorifics, 'Honorific'], 51 | [people, 'Person'], 52 | 53 | [countries, 
'Country'], 54 | [regions, 'Region'], 55 | [places, 'Place'], 56 | [cities, 'City'], 57 | 58 | [cardinals, 'Cardinal'], 59 | [ordinals, 'Ordinal'], 60 | [units, 'Unit'], 61 | 62 | [infinitives, 'Infinitive'], 63 | 64 | [masculine, 'MaleNoun'], 65 | [feminine, 'FemaleNoun'], 66 | [sportsTeams, 'SportsTeam'], 67 | [organizations, 'Organization'], 68 | [possessives, 'Possessive'], 69 | [pronouns, 'Pronoun'], 70 | [uncountables, 'Uncountable'], 71 | [nouns, 'Noun'], 72 | 73 | [masc, 'MaleAdjective'], 74 | 75 | [adverbs, 'Adverb'], 76 | [conjunctions, 'Conjunction'], 77 | [currencies, 'Currency'], 78 | [expressions, 'Expression'], 79 | [determiners, 'Determiner'], 80 | [prepositions, 'Preposition'], 81 | 82 | [dates, 'Date'], 83 | [months, 'Month'], 84 | [weekdays, 'WeekDay'], 85 | ] 86 | for (let i = 0; i < data.length; i++) { 87 | const list = data[i][0] 88 | for (let o = 0; o < list.length; o++) { 89 | //log duplicates 90 | // if (lex[list[o]]) { 91 | // console.log(list[o] + ' ' + lex[list[o]] + ' ' + data[i][1]) 92 | // } 93 | lex[list[o]] = data[i][1] 94 | } 95 | } 96 | 97 | export default lex 98 | // console.log(Object.keys(lex).length); 99 | // console.log(lex['mars']) 100 | -------------------------------------------------------------------------------- /scratch.js: -------------------------------------------------------------------------------- 1 | import nlp from './src/index.js' 2 | nlp.verbose('tagger') 3 | /* 4 | 5 | */ 6 | 7 | 8 | // console.log(nlp('essayer').verbs().conjugate()) 9 | 10 | let root = 'errer' 11 | let arr = [ 12 | // mauvais 13 | // 'Elle a eu une mauvaise expérience', 14 | // devenir 15 | // 'Elle est devenue une célèbre', //passe-compose 16 | 17 | // bénir 18 | // 'Que Dieu te bénisse avec bonheur', //subjunctive 19 | 20 | // revendiquer 21 | // 'Il revendiqua avoir vu un OVNI.', //passe-simple 22 | 23 | // accroupir 24 | // `Elle s'est accroupie derrière l'arbre`, //passe anterior 25 | 26 | 27 | // ménage 28 | // `Les tâches ménagères 
`, 29 | 30 | // nier 31 | // `la nouvelle loi nierait leurs droits`, //conditional 32 | 33 | // vieux 34 | // `La vieille maison`, 35 | // `une collection de photographies`, 36 | 37 | // promouvoir 38 | // `Elle a été promue à un poste`, // 39 | 40 | // pleuvoir 41 | // `quand il pleut `, 42 | 43 | // refléter 44 | // `Je réfléchis toujours`, //? 45 | 46 | // rôtir 47 | // `Elle a rôti une dinde`, //passe compose 48 | 49 | 50 | // soupirer 51 | // `Elle soupira `, //passe simple 52 | 53 | // envoler 54 | // `La montgolfière au-dessus des montagnes`, 55 | 56 | // // chanceler 57 | // `Il chez lui `, 58 | 59 | 60 | // épais 61 | // `une couverture épaisse`, 62 | 63 | // essayer 64 | // `Elle essaie de parler `, 65 | 66 | // errer 67 | `Le vieil homme et se perdit.`, //passe simple 68 | // ["devenir", "become", "Verb", "She a famous singer after years of practice.", "Elle est devenue une célèbre chanteuse après des années de pratique."], 69 | // ["accroupir", "crouch", "Verb", "She behind the tree to hide.", "Elle s'est accroupie derrière l'arbre pour se cacher."], 70 | 71 | // ["endormi", "asleep", "Adjective", "I love listening to music while falling .", "J'aime écouter de la musique en m'endormant."], 72 | // ["mauvais", "bad", "Adjective", "She had a experience with her previous boss.", "Elle a eu une mauvaise expérience avec son ancien patron."], 73 | // ["épais", "thick", "Adjective", "The book has a cover.", "Le livre a une couverture épaisse."], 74 | 75 | 76 | 77 | // ['Il pêche la truite tous', 'pêcher'], 78 | // [`L'équipe a été vaincue lors du match final`, 'vaincre'], 79 | // ['', ''], 80 | // 'accroupir', 81 | 82 | // 'Il abrégera son nom ', 83 | // 'marcher', 84 | // 'ralentir', 85 | // 'vendre', 86 | // 'hier', 87 | // // 'célèbre', 88 | // // 'très délicieux ', 89 | // 'Le gâteau était très délicieux ', 90 | // 'j\'ai lu trois livres', 91 | // `nous détestons le sable`, 92 | // `deuxième`, 93 | // 'vieillir', 94 | // 'envahir', 95 | // 'réfléchir', 96 
import { convert, reverse } from 'suffix-thumb'
import model from '../model.js'

// ---verbs--

// output-slot name → model key, in the order the conjugation object is built
const slots = [
  ['first', 'je'],
  ['second', 'tu'],
  ['third', 'il'],
  ['firstPlural', 'nous'],
  ['secondPlural', 'vous'],
  ['thirdPlural', 'ils'],
]

// person-tag → model key, used to pick a single suffix-model
const personKeys = new Map([
  ['FirstPerson', 'je'],
  ['SecondPerson', 'tu'],
  ['ThirdPerson', 'il'],
  ['FirstPersonPlural', 'nous'],
  ['SecondPersonPlural', 'vous'],
  ['ThirdPersonPlural', 'ils'],
])

// run suffix-thumb's reverse() over every entry of a tense model
const reverseAll = function (obj) {
  const out = {}
  for (const key of Object.keys(obj)) {
    out[key] = reverse(obj[key])
  }
  return out
}

// convert one string with all six person-models of a tense
const doVerb = function (str, m) {
  const res = {}
  slots.forEach(([slot, key]) => {
    res[slot] = convert(str, m[key])
  })
  return res
}

// convert one string with the model of a single person-tag
// (an unrecognised tag returns the string untouched)
const doOneVerb = function (str, form, m) {
  const key = personKeys.get(form)
  if (key === undefined) {
    return str
  }
  return convert(str, m[key])
}

// infinitive → inflections
const toPresentTense = function (str) {
  return doVerb(str, model.presentTense)
}
const toFutureTense = function (str) {
  return doVerb(str, model.futureTense)
}
const toImperfect = function (str) {
  return doVerb(str, model.imperfect)
}
const toPastParticiple = function (str) {
  return convert(str, model.pastParticiple.prt)
}

// reversed models, built once at load-time
const fromPresent = reverseAll(model.presentTense)
const fromFuture = reverseAll(model.futureTense)
const fromImperfect = reverseAll(model.imperfect)
const fromParticiple = reverse(model.pastParticiple.prt)

// inflection → infinitive
const fromPresentTense = function (str, form) {
  return doOneVerb(str, form, fromPresent)
}
const fromFutureTense = function (str, form) {
  return doOneVerb(str, form, fromFuture)
}
const fromImperfectTense = function (str, form) {
  return doOneVerb(str, form, fromImperfect)
}
const fromPastParticiple = function (str) {
  return convert(str, fromParticiple)
}

// do this one manually:
// swap a trailing ées/ée/és/é for 'er', trying the longest ending first
const fromPassive = function (str) {
  return [/ées$/, /ée$/, /és$/, /é$/].reduce((s, reg) => s.replace(reg, 'er'), str)
}

// the four agreement endings of an '-er' verb; other verbs produce nothing
const toPassive = function (str) {
  if (!str.endsWith('er')) {
    return []
  }
  return ['ées', 'ée', 'és', 'é'].map(end => str.replace(/er$/, end))
}

// an array of every inflection, for '{inf}' syntax
// (empty results are dropped, duplicates are collapsed)
const all = function (str) {
  const forms = [
    str,
    ...Object.values(toPresentTense(str)),
    ...Object.values(toFutureTense(str)),
    ...Object.values(toImperfect(str)),
    ...toPassive(str),
    toPastParticiple(str),
  ]
  return Array.from(new Set(forms.filter(s => s)))
}

export default {
  all,
  toPresentTense, toFutureTense, toImperfect, toPastParticiple,
  fromPresentTense, fromFutureTense, fromImperfectTense, fromPastParticiple, fromPassive
}
import test from 'tape'
import nlp from './_lib.js'
let here = '[fr-dates] '

// zero-padded month numbers, so the fixtures below read naturally
let jan = '01'
let feb = '02'
let mar = '03'
let apr = '04'
let may = '05'
let june = '06'
let july = '07'
let august = '08'
let sept = '09'
let oct = '10'
let nov = '11'
let dec = '12'
// reference 'today' handed to the parser for every fixture
const today = [1998, 2, 2]
const opts = { timezone: 'UTC', today }

// [ text, expected-start [y, m, d], optional expected-end [y, m, d] ]
// NOTE(review): 'hier soir' and 'hier après-midi' expect different days
// for the same 'today' - one of the two expectations looks stale; confirm.
const arr = [
  [`je suis né le 2 septembre 1982`, [1982, sept, 2]],
  [`Je travaille jusqu'en juin.`, [1998, 3, 2], [1998, june, 1]],
  [`Il n'y a pas d'augmentation prévue jusqu'en 2032`, [2032, jan, 1]],
  [`Je suis en vacances jusqu'au 3 janvier.`, [1998, jan, 3]],
  [`Je peux t'emprunter ta voiture jusqu'à lundi prochain`, [1998, feb, 17]],
  ['Nous avons acheté la maison le 15 avril 2013.', [2013, apr, 15]],
  ['Le 1er mai est un jour férié en France', [1998, may, 1]],
  ['Je vais y aller le premier décembre 2014.', [2014, dec, 1]],
  [`le 8 aout 2014.`, [2014, august, 8]],
  [`Aujourd'hui, c'est le 8 septembre 2024.`, [2024, sept, 8]],
  [`Nous sommes le 1er février aujourd'hui.`, [1998, feb, 1]],
  [`Nous sommes le vendredi 1er février aujourd'hui`, [1998, feb, 1]],
  ['15/12/2020', [2020, dec, 15]],
  ['5/2/2020', [2020, feb, 5]],
  ['12/01/2018', [2018, jan, 12]],
  // ['01/13/2018', [2018, jan, 13]],
  ['Le 6 avril', [1998, apr, 6]],
  ['Mercredi 11 mars', [1998, mar, 11]],
  ['12/06/2020', [2020, june, 12]],
  ['Halloween est le 31 octobre.', [1998, oct, 31]],
  [`C'est le quatorze juillet.`, [1998, july, 14]],
  [`c'est le premier janvier`, [1998, jan, 1]],
  ['le 5 juin 2012', [2012, june, 5]],
  ['Juin 5, 2012', [2012, june, 5]],
  ['6/5/2012', [2012, may, 6]],
  ['le 25 décembre 2012', [2012, dec, 25]],
  ['December 25, 2012', [2012, dec, 25]],
  ['12/15/2012', [2012, dec, 15]],
  ['le 3 novembre 2012', [2012, nov, 3]],
  ['Novembre 3, 2021', [2021, nov, 3]], // have 2 years in slug
  ['3/11/21', [2021, nov, 3]],
  ['entre sept et oct', [1998, sept, 1], [1998, oct, 1]],
  ['demain à 10h', [1998, feb, 3]], // tomorrow at 10am
  ['lundi 20', [1998, apr, 20]], // next monday 20th
  ['lundi 20 à 10h', [1998, apr, 20]], // next monday 20th at 10am
  ['hier soir', [1998, feb, 12]], // yesterday evening
  ['semaine prochaine', [1998, feb, 17]], // next week
  ['14h30 demain', [1998, feb, 3]], // 2:30pm tomorrow
  ['demain matin à 9h', [1998, feb, 3]], // tomorrow morning at 9am
  ['hier après-midi', [1998, feb, 1]], //yesterday afternoon
]

// 2 → '02'
const padZero = num => String(num).padStart(2, '0')

test('dates:', function (t) {
  arr.forEach(a => {
    let [str, start, end] = a
    // make them ISOs ('' when no end-date is expected)
    start = start.map(padZero).join('-')
    end = (end || []).map(padZero).join('-')

    let doc = nlp(str)
    // t.equal(doc.has('#Date'), true, here + `has-date: '${str}'`)

    // when nothing is found, fall back to an empty result of the same shape,
    // so a missed parse fails its own assertion instead of throwing
    let json = doc.dates(opts).json({ terms: false })[0] || { dates: [] }
    let dates = (json.dates || [])[0] || { start: '', end: '' }

    // test the start date is the ISO
    let iso = (dates.start || '').replace(/T00:00:00\.000Z$/, '')
    t.equal(iso, start, here + `[start]: ${str}`)
    // test the end date is the ISO
    if (end) {
      iso = (dates.end || '').replace(/T.*$/, '')
      t.equal(iso, end, here + `[end]: ${str}`)
    }
  })
  t.end()
})
-------------------------------------------------------------------------------- 1 | //some major 'second-level' administrative divisions 2 | export default [ 3 | 'alabama', 4 | 'alaska', 5 | 'arizona', 6 | 'arkansas', 7 | 'california', 8 | 'colorado', 9 | 'connecticut', 10 | 'delaware', 11 | 'florida', 12 | 'georgia', 13 | 'hawaii', 14 | 'idaho', 15 | 'illinois', 16 | 'indiana', 17 | 'iowa', 18 | 'kansas', 19 | 'kentucky', 20 | 'louisiana', 21 | 'maine', 22 | 'maryland', 23 | 'massachusetts', 24 | 'michigan', 25 | 'minnesota', 26 | 'mississippi', 27 | 'missouri', 28 | 'montana', 29 | 'nebraska', 30 | 'nevada', 31 | 'new hampshire', 32 | 'new jersey', 33 | 'new mexico', 34 | 'new york state', 35 | 'new york', 36 | 'north carolina', 37 | 'north dakota', 38 | 'ohio', 39 | 'oklahoma', 40 | 'oregon', 41 | 'pennsylvania', 42 | 'rhode island', 43 | 'south carolina', 44 | 'south dakota', 45 | 'tennessee', 46 | 'texas', 47 | 'utah', 48 | 'vermont', 49 | 'virginia', 50 | 'washington dc', 51 | 'washington', 52 | 'west virginia', 53 | 'wisconsin', 54 | 'wyoming', 55 | 56 | //canada 57 | 'alberta', 58 | 'british columbia', 59 | 'manitoba', 60 | 'new brunswick', 61 | 'newfoundland', 62 | 'newfoundland and labrador', 63 | 'nova scotia', 64 | 'nunavut', 65 | 'ontario', 66 | 'prince edward island', 67 | 'pei', 68 | 'quebec', 69 | 'saskatchewan', 70 | 'yukon', 71 | 72 | //australia 73 | 'norfolk', 74 | 'queensland', 75 | 'tasmania', 76 | 'victoria', 77 | 78 | //china 79 | 'qinghai', 80 | 'sichuan', 81 | 'gansu', 82 | 'hunan', 83 | 'guangdong', 84 | 'guizhou', 85 | 'fujian', 86 | 'jiangxi', 87 | 88 | //india 89 | 'rajasthan', 90 | 'madhya', 91 | 'maharashtra', 92 | 'uttar pradesh', 93 | 'kashmir', 94 | 'gujarat', 95 | 'karnataka', 96 | 'manipur', 97 | 'odisha', 98 | 99 | //mexico 100 | 'aguascalientes', 101 | 'baja california', 102 | 'campeche', 103 | 'chiapas', 104 | 'chihuahua', 105 | 'coahuila', 106 | 'colima', 107 | 'durango', 108 | 'guanajuato', 109 | 'guerrero', 110 | 
'hidalgo', 111 | 'jalisco', 112 | 'michoacan', 113 | 'morelos', 114 | 'nayarit', 115 | 'nuevo leon', 116 | 'oaxaca', 117 | 'queretaro', 118 | 'quintana roo', 119 | 'san luis potosi', 120 | 'sinaloa', 121 | 'sonora', 122 | 'tabasco', 123 | 'tamaulipas', 124 | 'tlaxcala', 125 | 'veracruz', 126 | 'yucatan', 127 | 'zacatecas', 128 | 129 | //western-europe 130 | 'basque', 131 | 'bavaria', 132 | 'bremen', 133 | 'buckinghamshire', 134 | 'cambridgeshire', 135 | 'corsica', 136 | 'coventry', 137 | 'cumbria', 138 | 'derbyshire', 139 | 'dorset', 140 | 'essex', 141 | 'gloucestershire', 142 | 'hampshire', 143 | 'hertfordshire', 144 | 'lancashire', 145 | 'leeds', 146 | 'leicestershire', 147 | 'lincolnshire', 148 | 'midlands', 149 | 'normandy', 150 | 'north yorkshire', 151 | 'northamptonshire', 152 | 'nottinghamshire', 153 | 'oxfordshire', 154 | 'saxony', 155 | 'sicily', 156 | 'somerset', 157 | 'staffordshire', 158 | 'suffolk', 159 | 'surrey', 160 | 'sussex', 161 | 'tuscany', 162 | 'warwickshire', 163 | 'yorkshire', 164 | 165 | //bangladesh 166 | 'rajshahi', 167 | 'rangpur', 168 | 'khulna', 169 | 'sylhet', 170 | 171 | //brazil 172 | 'minas gerais', 173 | 'bahia', 174 | 'parana', 175 | 'pernambuco', 176 | 'ceara', 177 | 'para', 178 | 'maranhao', 179 | 'santa catarina', 180 | 181 | //misc 182 | 'siberia', 183 | ] 184 | -------------------------------------------------------------------------------- /plugins/dates/tests/backburner/ambig-month.ignore.js: -------------------------------------------------------------------------------- 1 | import test from 'tape' 2 | import nlp from './_lib.js' 3 | import spacetime from 'spacetime' 4 | 5 | const fmt = (iso) => (iso ? 
spacetime(iso).format('{iso-short}') : '-') 6 | 7 | test('this month', function (t) { 8 | let arr = [ 9 | [2020, 11, 1], 10 | [2020, 11, 8], 11 | [2020, 11, 11], 12 | [2020, 11, 20], 13 | [2020, 11, 25], 14 | [2020, 11, 31], 15 | [2020, 11, 31], 16 | ] 17 | arr.forEach((a) => { 18 | let doc = nlp('this month') 19 | let found = doc.dates({ today: a }).json()[0] || {} 20 | t.equal(fmt((found.dates || {}).start), '2020-12-01', 'this-start') 21 | t.equal(fmt((found.dates || {}).end), '2020-12-31', 'this-end') 22 | }) 23 | t.end() 24 | }) 25 | 26 | test('next month', function (t) { 27 | let arr = [ 28 | [2020, 11, 1], 29 | [2020, 11, 8], 30 | [2020, 11, 11], 31 | [2020, 11, 20], 32 | [2020, 11, 25], 33 | [2020, 11, 31], 34 | [2020, 11, 31], 35 | ] 36 | arr.forEach((a) => { 37 | let doc = nlp('next month') 38 | let found = doc.dates({ today: a }).json()[0] || {} 39 | t.equal(fmt((found.dates || {}).start), '2021-01-01', 'next-start') 40 | t.equal(fmt((found.dates || {}).end), '2021-01-31', 'next-end') 41 | }) 42 | t.end() 43 | }) 44 | 45 | test('last month', function (t) { 46 | let arr = [ 47 | [2020, 11, 1], 48 | [2020, 11, 8], 49 | [2020, 11, 11], 50 | [2020, 11, 20], 51 | [2020, 11, 25], 52 | [2020, 11, 31], 53 | [2020, 11, 31], 54 | ] 55 | arr.forEach((a) => { 56 | let doc = nlp('last month') 57 | let found = doc.dates({ today: a }).json()[0] || {} 58 | t.equal(fmt((found.dates || {}).start), '2020-11-01', 'last-start') 59 | t.equal(fmt((found.dates || {}).end), '2020-11-30', 'last-end') 60 | }) 61 | t.end() 62 | }) 63 | 64 | test('this december', function (t) { 65 | let arr = [ 66 | [2020, 1, 1], 67 | [2020, 2, 8], 68 | [2020, 3, 11], 69 | [2020, 4, 20], 70 | [2020, 5, 25], 71 | [2020, 6, 28], 72 | [2020, 7, 12], 73 | [2020, 8, 12], 74 | [2020, 9, 16], 75 | [2020, 10, 1], 76 | [2020, 11, 11], 77 | ] 78 | arr.forEach((a) => { 79 | let doc = nlp('this december') 80 | let found = doc.dates({ today: a }).json()[0] || {} 81 | t.equal(fmt((found.dates || {}).start), 
'2020-12-01', 'this december') 82 | t.equal(fmt((found.dates || {}).end), '2020-12-31', 'this december') 83 | 84 | doc = nlp('next december') 85 | found = doc.dates({ today: a }).json()[0] || {} 86 | t.equal(fmt((found.dates || {}).start), '2021-12-01', 'next december') 87 | t.equal(fmt((found.dates || {}).end), '2021-12-31', 'next december') 88 | 89 | doc = nlp('last december') 90 | found = doc.dates({ today: a }).json()[0] || {} 91 | t.equal(fmt((found.dates || {}).start), '2019-12-01', 'last december') 92 | t.equal(fmt((found.dates || {}).end), '2019-12-31', 'last december') 93 | }) 94 | t.end() 95 | }) 96 | 97 | test('this september', function (t) { 98 | let doc = nlp('this september') 99 | let found = doc.dates({ today: [2019, 7, 4] }).json()[0] || {} 100 | t.equal(fmt((found.dates || {}).start), '2019-09-01', 'this sept - before') 101 | 102 | found = doc.dates({ today: [2019, 8, 4] }).json()[0] || {} 103 | t.equal(fmt((found.dates || {}).start), '2019-09-01', 'this sept - during') 104 | 105 | found = doc.dates({ today: [2019, 9, 4] }).json()[0] || {} 106 | t.equal(fmt((found.dates || {}).start), '2020-09-01', 'this sept - after') 107 | t.end() 108 | }) 109 | -------------------------------------------------------------------------------- /data/lexicon/nouns/sportsTeams.js: -------------------------------------------------------------------------------- 1 | export default [ 2 | //mlb 3 | 'arizona diamondbacks', 4 | 'atlanta braves', 5 | 'baltimore orioles', 6 | 'boston red sox', 7 | 'chicago cubs', 8 | 'chicago white sox', 9 | 'cincinnati reds', 10 | 'cleveland indians', 11 | 'colorado rockies', 12 | 'detroit tigers', 13 | 'houston astros', 14 | 'kansas city royals', 15 | 'los angeles dodgers', 16 | 'miami marlins', 17 | 'milwaukee brewers', 18 | 'minnesota twins', 19 | 'new york mets', 20 | 'new york yankees', 21 | 'oakland athletics', 22 | 'philadelphia phillies', 23 | 'pittsburgh pirates', 24 | 'san diego padres', 25 | 'san francisco giants', 26 | 'seattle 
mariners', 27 | 'st. louis cardinals', 28 | 'tampa bay rays', 29 | 'texas rangers', 30 | 'toronto blue jays', 31 | 'washington nationals', 32 | 'diamondbacks', 33 | 'white sox', 34 | 'astros', 35 | 'dodgers', 36 | 'mets', 37 | 'yankees', 38 | 'phillies', 39 | 'padres', 40 | 41 | //nba 42 | 'boston celtics', 43 | 'brooklyn nets', 44 | 'new york knicks', 45 | 'philadelphia 76ers', 46 | 'toronto raptors', 47 | 'chicago bulls', 48 | 'cleveland cavaliers', 49 | 'detroit pistons', 50 | 'indiana pacers', 51 | 'milwaukee bucks', 52 | 'atlanta hawks', 53 | 'charlotte hornets', 54 | 'miami heat', 55 | 'orlando magic', 56 | 'washington wizards', 57 | 'dallas mavericks', 58 | 'houston rockets', 59 | 'memphis grizzlies', 60 | 'new orleans pelicans', 61 | 'san antonio spurs', 62 | 'denver nuggets', 63 | 'minnesota timberwolves', 64 | 'portland trail blazers', 65 | 'oklahoma city thunder', 66 | 'utah jazz', 67 | 'golden state warriors', 68 | 'los angeles clippers', 69 | 'los angeles lakers', 70 | 'phoenix suns', 71 | 'sacramento kings', 72 | 'knicks', 73 | 'lakers', 74 | 'celtics', 75 | 76 | //nfl 77 | 'arizona cardinals', 78 | 'atlanta falcons', 79 | 'baltimore ravens', 80 | 'buffalo bills', 81 | 'carolina panthers', 82 | 'chicago bears', 83 | 'cincinnati bengals', 84 | 'cleveland browns', 85 | 'dallas cowboys', 86 | 'denver broncos', 87 | 'detroit lions', 88 | 'green bay packers', 89 | 'houston texans', 90 | 'indianapolis colts', 91 | 'jacksonville jaguars', 92 | 'kansas city chiefs', 93 | 'miami dolphins', 94 | 'minnesota vikings', 95 | 'new england patriots', 96 | 'new orleans saints', 97 | 'new york giants', 98 | 'new york jets', 99 | 'oakland raiders', 100 | 'philadelphia eagles', 101 | 'pittsburgh steelers', 102 | 'san diego chargers', 103 | 'san francisco 49ers', 104 | 'seattle seahawks', 105 | 'st. 
import { forEachSync } from './_giga.js'
import doSentences from './french.js'
import fs from 'fs'
import nlp from '../../src/index.js'

// corpus file ids: '0001' .. '0010'
let ids = []
for (let i = 1; i <= 10; i += 1) {
  let str = String(i).padStart(4, '0')
  ids.push(str)
}
// development override - only run one file
ids = ['0004']

// corpus part-of-speech code → our tag
let tagMap = {
  'ABR': 'Acronym',//abbreviation
  'ADJ': 'Adjective',//adjective
  'ADV': 'Adverb',//adverb
  'DET:ART': 'Determiner',//article
  'DET:POS': 'Pronoun',//possessive pronoun (ma, ta, ...)
  'INT': 'Interjection',//interjection
  'KON': 'Conjunction',//conjunction
  'NAM': 'ProperNoun',//proper name
  'NOM': 'Noun',//noun
  'NUM': 'Value',//numeral
  'PRO': 'Pronoun',//pronoun
  'PRO:DEM': 'Pronoun',//demonstrative pronoun
  'PRO:IND': 'Pronoun',//indefinite pronoun
  'PRO:PER': 'Pronoun',//personal pronoun
  'PRO:POS': 'Pronoun',//possessive pronoun (mien, tien, ...)
  'PRO:REL': 'Pronoun',//relative pronoun
  'PRP': 'Preposition',//preposition
  'PRP:det': 'Preposition',//preposition plus article (au,du,aux,des)
  // 'PUN':'',//punctuation
  // 'PUN:cit':'',//punctuation citation
  // 'SENT':'',//sentence tag
  // 'SYM':'',//symbol
  'VER:cond': 'Verb',//verb conditional
  'VER:futu': 'Verb',//verb futur
  'VER:impe': 'Verb',//verb imperative
  'VER:impf': 'Verb',//verb imperfect
  'VER:infi': 'Verb',//verb infinitive
  'VER:pper': 'Verb',//verb past participle
  'VER:ppre': 'Verb',//verb present participle
  'VER:pres': 'Verb',//verb present
  'VER:simp': 'Verb',//verb simple past
  'VER:subi': 'Verb',//verb subjunctive imperfect
  'VER:subp': 'Verb',//verb subjunctive present
}

// words that are never scored
const ignore = new Set(['au', 'aux', 'des', 'ne', '$', '.', '(', ')', 'se'])

// word → number of times it was tagged wrongly
// (no prototype, so a corpus word like 'constructor' is just another key)
let bad = Object.create(null)

let right = 0
let wrong = 0

// score our tagger against one annotated sentence
const doBoth = function (both) {
  let txt = both.fr.map(o => o['$text']).join(' ')
  // re-attach punctuation that the join left floating
  txt = txt.replace(/ ([.,?):])/g, `$1`)
  // lowercased word → expected tag (a Map, so corpus words can't hit Object.prototype)
  let correct = new Map()
  both.fr.forEach(term => {
    let tag = tagMap[term['$'].pos]
    if (tag) {
      correct.set(term['$text'].toLowerCase(), tag)
    }
  })
  let doc = nlp(txt)
  doc.terms().forEach(t => {
    let str = t.text('normal')
    let want = correct.get(str) || null
    if (!want || ignore.has(str)) {
      return
    }
    if (t.has('#' + want)) {
      right += 1
    } else {
      wrong += 1
      bad[str] = (bad[str] || 0) + 1
      // console.log(txt)
      // console.log(want)
      // t.debug()
    }
  })
}


// setInterval(() => {
//   let all = Object.entries(bad).sort((a, b) => {
//     if (a[1] > b[1]) {
//       return -1
//     } else if (a[1] < b[1]) {
//       return 1
//     }
//     return 0
//   })
//   all = all.slice(0, 100)
//   console.log(all)
// }, 10000)

// part-of-total as a percentage, to one decimal place
// (0 when nothing was counted, rather than NaN)
const percent = (part, total) => {
  if (!total) {
    return 0
  }
  return Math.round((part / total) * 1000) / 10
}

await forEachSync(ids, async id => {
  try {
    console.log(`\ndoing ${id}:\n`)
    await doSentences(id, doBoth)
    console.log(right, ` right ${percent(right, right + wrong)}%`)
  } catch (e) {
    // keep going with the next file, but show what failed
    console.log(e)
  }
})
console.log(right, ` right ${percent(right, right + wrong)}%`)
console.log(wrong, ` wrong ${percent(wrong, right + wrong)}%`)
32 | doc.docs 33 | doc.pointer 34 | doc.methods 35 | doc.model 36 | doc.hooks 37 | doc.isView 38 | doc.found 39 | doc.length 40 | 41 | // One 42 | doc.compute('id') 43 | // change 44 | m = doc.toLowerCase() 45 | m = doc.toUpperCase() 46 | m = doc.toTitleCase() 47 | m = doc.toCamelCase() 48 | m = doc.insertAfter('asdf') 49 | m = doc.insertBefore('boo') 50 | m = doc.append('foo') 51 | m = doc.prepend('foo') 52 | m = doc.insert('bar') 53 | m = doc.match('flood').replaceWith('asf') 54 | m = doc.replace('m', 'woo') 55 | m = doc.remove('foo') 56 | m = doc.delete('bar') 57 | m = doc.pre(' ') 58 | m = doc.post(' ') 59 | m = doc.trim() 60 | m = doc.hyphenate() 61 | m = doc.dehyphenate() 62 | m = doc.toQuotations() 63 | m = doc.toParentheses() 64 | m = doc.deHyphenate() 65 | m = doc.toQuotation() 66 | m = doc.unique() 67 | m = doc.reverse() 68 | m = doc.sort() 69 | m = doc.concat(doc.none()) 70 | // doc.fork() 71 | 72 | doc.compute('contractions') 73 | doc.compute('lexicon') 74 | doc.lookup(['blue jays', 'farmer']) 75 | 76 | // match 77 | m = doc.matchOne('#Foo') 78 | m = doc.match('#Foo') 79 | let bool = doc.has('#Foo') 80 | m = doc.if('#Foo') 81 | m = doc.ifNo('#Foo') 82 | m = doc.before('#Foo') 83 | m = doc.after('#Foo') 84 | m = doc.growLeft('#Foo') 85 | m = doc.growRight('#Foo') 86 | m = doc.grow('#Foo') 87 | m = doc.splitOn('#Foo') 88 | m = doc.splitBefore('#Foo') 89 | m = doc.splitAfter('#Foo') 90 | m = doc.split('#Foo') 91 | 92 | // output 93 | let res = doc.out() 94 | let txt = doc.text() 95 | txt = doc.text('normal') 96 | txt = doc.text('machine') 97 | txt = doc.text('root') 98 | txt = doc.text('implicit') 99 | txt = doc.json() 100 | 101 | // sets 102 | m = doc.union('blah') 103 | m = doc.and('blah') 104 | m = doc.intersection('blah') 105 | m = doc.difference('blah') 106 | m = doc.not('blah') 107 | m = doc.complement('blah') 108 | m = doc.settle('blah') 109 | 110 | m = doc.tag('Foo') 111 | m = doc.tagSafe('Foo') 112 | m = doc.unTag('Foo') 113 | m = doc.canBe('Foo') 
114 | 115 | doc.compute('alias') 116 | doc.compute('normal') 117 | doc.compute('machine') 118 | doc.compute('freq') 119 | doc.compute('offset') 120 | doc.compute('index') 121 | doc.compute('wordCount') 122 | 123 | doc.compute('typeahead') 124 | doc.autoFill() 125 | 126 | // sweep 127 | let matches = [ 128 | { match: '2nd quarter of? 2022', tag: 'TimePeriod' }, 129 | { match: '(from|by|before) now', tag: 'FooBar' }, 130 | ] 131 | let net = frCompromise.buildNet(matches) 132 | doc = frCompromise(`so good by now. woo hoo before now. in the 2nd quarter 2022`) 133 | let sr = doc.sweep(net) 134 | 135 | // lazy 136 | doc = frCompromise.lazy('hello', 'foo') 137 | 138 | t.ok(true) 139 | t.end() 140 | }) 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /plugins/dates/src/phrase/date/index.js: -------------------------------------------------------------------------------- 1 | import { months, days } from './data.js' 2 | import { Moment, Month, Day, Week, Year } from './units.js' 3 | import spacetime from 'spacetime' 4 | 5 | 6 | 7 | 8 | // some re-used helper functions: 9 | const parseMonth = function (m) { 10 | let str = m.text('normal') 11 | if (months.hasOwnProperty(str)) { 12 | return months[str] - 1 13 | } 14 | return null 15 | } 16 | const parseNumber = function (m) { 17 | let str = m.text('normal') 18 | str = str.replace(/e$/, '')//ordinal 19 | return parseInt(str, 10) 20 | } 21 | 22 | const isValid = function (cal) { 23 | // if (!cal.month || !cal.date || !cal.year) { 24 | // return false 25 | // } 26 | return true 27 | } 28 | 29 | // pull-apart a spcific date, like 'le 2e oct' independant of a longer phrase 30 | const parseOne = function (m, opts) { 31 | const { today } = opts 32 | // clean it up a little 33 | // m = normalize(m) 34 | // match '2 septembre 1982' 35 | let res = m.match('[#Value] [#Month] [#Year]') 36 | if (res.found) { 37 | let cal = { 38 | month: parseMonth(res.groups('month')), 39 | date: 
parseNumber(res.groups('date')), 40 | year: parseNumber(res.groups('year')), 41 | } 42 | if (isValid(cal)) { 43 | return new Day(cal, opts) 44 | } 45 | } 46 | // 'oct 2021' 47 | res = m.match('[#Month] [#Year]') 48 | if (res.found) { 49 | let cal = { 50 | month: parseMonth(res.groups('month')), 51 | year: parseNumber(res.groups('year')) || today.year(), 52 | } 53 | if (isValid(cal)) { 54 | return new Month(cal, opts) 55 | } 56 | } 57 | // 'oct 22nd' 58 | res = m.match('[#Month] [#Value] #Year?') 59 | if (res.found) { 60 | let cal = { 61 | month: parseMonth(res.groups('month')), 62 | date: parseNumber(res.groups('date')) || today.date(), 63 | year: parseNumber(res.match('#Year')) || today.year(), 64 | } 65 | if (isValid(cal)) { 66 | return new Day(cal, opts) 67 | } 68 | } 69 | // '6 avril' 70 | res = m.match('[#Value] [#Month] #Year?') 71 | if (res.found) { 72 | let cal = { 73 | // month: parseMonth(res.groups('month')), 74 | // date: parseNumber(res.groups('date')) || today.date(), 75 | month: parseMonth(res.match('#Month')), 76 | date: parseNumber(res.match('#Value')) || today.date(), 77 | year: parseNumber(res.match('#Year')) || today.year(), 78 | } 79 | if (isValid(cal)) { 80 | return new Day(cal, opts) 81 | } 82 | } 83 | // '2021' 84 | res = m.match('[#Year]') 85 | if (res.found) { 86 | let cal = { year: parseNumber(res.groups('year')) } 87 | if (isValid(cal)) { 88 | return new Year(cal, opts) 89 | } 90 | } 91 | // 'octobre' 92 | res = m.match('[#Month]') 93 | if (res.found) { 94 | let cal = { month: parseMonth(res.groups('month')), year: today.year() } 95 | if (isValid(cal)) { 96 | return new Month(cal, opts) 97 | } 98 | } 99 | // '2021-02-12' 100 | res = m.match('#Date+') 101 | if (res.found) { 102 | let s = spacetime(res.text('normal'), opts.timezone, { dmy: true }) 103 | if (s.isValid()) { 104 | return new Moment(s, opts) 105 | } 106 | } 107 | // known words 108 | // yesterday 109 | if (m.has('hier')) { 110 | let s = spacetime(null, opts.timezone).minus(1, 
// src/02-two/preTagger/compute/3rd-pass/verb-form.js

// every grammatical-person tag this pass can assign
const person = ['FirstPerson', 'SecondPerson', 'ThirdPerson', 'FirstPersonPlural', 'SecondPersonPlural', 'ThirdPersonPlural']

// [suffix, person-tag] pairs, scanned top-to-bottom with endsWith() - the first hit wins,
// so a longer suffix must stay above any shorter one it ends with ('asses' before 'es', 'èrent' before 'ent')
const whichForm = [
  // passé simple
  ['ai', 'FirstPerson'],
  ['tas', 'SecondPerson'],
  ['ta', 'ThirdPerson'],
  ['âmes', 'FirstPersonPlural'],
  ['âtes', 'SecondPersonPlural'],
  ['èrent', 'ThirdPersonPlural'],
  // imparfait
  ['ait', 'ThirdPerson'],
  // futur
  ['eras', 'SecondPerson'],
  ['eront', 'ThirdPersonPlural'],
  // subjonctif imparfait
  ['asse', 'FirstPerson'],
  ['asses', 'SecondPerson'],
  ['tât', 'ThirdPerson'],
  // présent
  ['es', 'SecondPerson'],
  ['ons', 'FirstPersonPlural'],
  ['ez', 'SecondPersonPlural'],
  ['ent', 'ThirdPersonPlural'],
]

// subject pronouns, and the person they force on a following verb
const pronouns = {
  je: 'FirstPerson',
  tu: 'SecondPerson',
  il: 'ThirdPerson',
  elle: 'ThirdPerson',
  on: 'ThirdPerson',
  nous: 'FirstPersonPlural',
  vous: 'SecondPersonPlural',
  ils: 'ThirdPersonPlural',
  elles: 'ThirdPersonPlural',
}

// auxiliary verbs can give us a hint to verb person, too
const auxiliaries = {
  // être - présent
  suis: 'FirstPerson',
  es: 'SecondPerson',
  est: 'ThirdPerson',
  sommes: 'FirstPersonPlural',
  êtes: 'SecondPersonPlural',
  sont: 'ThirdPersonPlural',
  // être - futur
  serai: 'FirstPerson',
  seras: 'SecondPerson',
  sera: 'ThirdPerson',
  serons: 'FirstPersonPlural',
  serez: 'SecondPersonPlural',
  seront: 'ThirdPersonPlural',
  // être - conditionnel
  serait: 'ThirdPerson',
  serions: 'FirstPersonPlural',
  seriez: 'SecondPersonPlural',
  seraient: 'ThirdPersonPlural',

  // avoir - présent
  ai: 'FirstPerson',
  as: 'SecondPerson',
  a: 'ThirdPerson',
  avons: 'FirstPersonPlural',
  avez: 'SecondPersonPlural',
  ont: 'ThirdPersonPlural',
  // avoir - futur (used for the future anterior)
  aurai: 'FirstPerson',
  auras: 'SecondPerson',
  aura: 'ThirdPerson',
  aurons: 'FirstPersonPlural',
  aurez: 'SecondPersonPlural',
  auront: 'ThirdPersonPlural',
  // avoir - imparfait (used for the plus-que-parfait)
  avait: 'ThirdPerson',
  avions: 'FirstPersonPlural',
  aviez: 'SecondPersonPlural',
  avaient: 'ThirdPersonPlural',
  // avoir - conditionnel
  aurait: 'ThirdPerson',
  aurions: 'FirstPersonPlural',
  auriez: 'SecondPersonPlural',
  auraient: 'ThirdPersonPlural',
}

// own-property test that does not rely on the object's own (shadowable) method
const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key)

// return whichever of the term's word-forms is a key of `table`, or undefined.
// `normal` is tried first; `implicit` is the fallback
// (assumes a contraction-split term keeps its canonical word in `implicit` - TODO confirm)
const lookupKey = function (term, table) {
  return [term.normal, term.implicit].find(w => typeof w === 'string' && hasOwn(table, w))
}

// Guess a grammatical-person tag for the Verb at terms[i].
//  - terms: the terms of one sentence; each needs a `tags` Set and may carry normal/implicit/text
//  - i:     index of the term to inspect
//  - world: must expose world.methods.one.setTag
// Returns whatever setTag() returns when a tag is applied, otherwise null
// (term is not a Verb, already has a person tag, or no clue was found).
const verbForm = function (terms, i, world) {
  const setTag = world.methods.one.setTag
  const term = terms[i]
  const tags = term.tags
  // only untagged verbs are of interest
  if (!tags.has('Verb') || person.some(p => tags.has(p))) {
    return null
  }
  // 1) look at the word suffix, for clues
  const str = term.implicit || term.normal || term.text || ''
  const found = whichForm.find(a => str.endsWith(a[0]))
  if (found) {
    return setTag([term], found[1], world, false, '3-person-suffix-' + found[1])
  }
  // 2) look at the verb itself, then up to two terms back.
  // starting at 0 is deliberate: it lets a bare auxiliary ('suis') pick up its own person
  for (let back = 0; back < 3; back += 1) {
    const prev = terms[i - back]
    if (!prev) {
      break
    }
    // a subject pronoun - 'nous'
    if (prev.tags.has('Pronoun')) {
      const key = lookupKey(prev, pronouns)
      if (key !== undefined) {
        return setTag([term], pronouns[key], world, false, '3-person-pronoun-' + key)
      }
    }
    // an auxiliary verb - 'sont'
    if (prev.tags.has('Verb')) {
      const key = lookupKey(prev, auxiliaries)
      if (key !== undefined) {
        return setTag([term], auxiliaries[key], world, false, '3-person-auxiliary-' + key)
      }
    }
  }
  return null
}
export default verbForm
// src/02-two/postTagger/matches.js
import nounGender from '../preTagger/compute/3rd-pass/noun-gender.js'
import nounPlurals from '../preTagger/compute/3rd-pass/noun-plurals.js'
import adjGender from '../preTagger/compute/3rd-pass/adj-gender.js'
import adjPlurals from '../preTagger/compute/3rd-pass/adj-plurals.js'
import verbTense from '../preTagger/compute/3rd-pass/verb-tense.js'

// call each tagger as fn(terms, i, world) for every term of every document in the view `m`
const runTaggers = function (m, taggers) {
  const world = m.world
  m.docs.forEach(terms => {
    terms.forEach((_t, i) => {
      taggers.forEach(fn => fn(terms, i, world))
    })
  })
}

// re-run the gender + plural passes over freshly tagged nouns
const tagNoun = function (m) {
  runTaggers(m, [nounGender, nounPlurals])
}
// re-run the gender + plural passes over freshly tagged adjectives
const tagAdj = function (m) {
  runTaggers(m, [adjGender, adjPlurals])
}
// re-run the tense pass over freshly tagged verbs
const tagVerb = function (m) {
  runTaggers(m, [verbTense])
}

// number words that can sit next to 'sept' inside one larger number
const tens = '(dix|vingt|trente|quarante|cinquante|soixante|#Multiple)'

// Context-based corrections, applied after the per-word tagger.
// Rules run strictly in the order written - later patterns read the tags set by earlier ones.
// In every `match(pattern, 0)` call only the [bracketed] group receives the tag.
const postTagger = function (doc) {
  // ==Nouns==
  // l'inconnu
  doc.match('(le|un) [#Verb]', 0).tag(['MaleNoun', 'Singular'], 'le-verb')
  doc.match('(la|une) [#Verb]', 0).tag(['FemaleNoun', 'Singular'], 'la-verb')
  tagNoun(doc.match('(quelques|quelque) [#Verb]', 0).tag('Noun', 'quelque-verb'))
  tagNoun(doc.match('(des|les|mes|ces|tes|ses|nos|vos|leurs) [#Verb]', 0).tag('PluralNoun', 'des-verb'))

  // ==Verbs==
  // ne foo pas
  tagVerb(doc.match('ne [.] pas', 0).tag('Verb', 'ne-verb-pas'))
  // il active le
  tagVerb(doc.match('il [.] (le|la|les)', 0).tag('Verb', 'il-verb-le'))
  // reflexive
  tagVerb(doc.match('(se|me|te) [.]', 0).tag('Verb', 'se-verb'))
  // Elle interdit les transactions
  tagVerb(doc.match('(je|tu|il|elle|nous|vous|ils) [#Adjective] (la|le|les)', 0).tag('Verb', 'ils-x-les'))
  // sont interdites par l'interdiction
  tagVerb(doc.match('(est|été|sont|était|serait) [#Adjective] #Preposition', 0).tag('Verb', 'sont-x-par'))
  // a dissimulées
  tagVerb(doc.match('(ai|as|a|avons|avez|ont) [#Adjective]', 0).tag('PastTense', 'have-adj'))
  // have unpacked
  doc.match('(ai|as|a|avons|avez|ont) [#PresentTense]', 0).tag('PastTense', 'have-pres')
  // passive voice - est-aimée
  doc.match('#Copula #Adverb?+ [#PastParticiple]', 0).tag('Passive', 'passive-voice')

  // ==Adjectives==
  // est bien calculée
  tagAdj(doc.match('#Copula (bien|très|pas|plus|tant|presque|seulement)+ [#Verb]', 0).tag('Adjective', 'est-bien-calculee'))

  // ==Numbers==
  doc.match('#Value et (un|#Value)').tag('TextValue', 'et-un')
  doc.match('#Value un').tag('TextValue', 'quatre-vingt-un')
  doc.match('moins #Value').tag('TextValue', 'moins-value')

  // ==Dates==
  doc.match('[#Value] #Month', 0).tag('Date', 'val-month')
  doc.match('#Month [#Value] #Year', 0).tag('Date', 'mdy')
  doc.match('[#Value] #Month #Year', 0).tag('Date', 'dmy')
  // ambiguous 'sept' - the number seven, or short for 'septembre'
  doc.match('le #Value [sept]', 0).tag('Month', 'val-sept')
  doc.match('[sept] #Year', 0).tag('Month', 'sept-year')
  doc.match('[sept] (et|ou) #Month', 0).tag('Month', 'sept-et-month')
  // NOTE(review): this runs after 'val-sept', so a sentence-final 'le 5 sept' may be
  // re-tagged as a number here - confirm that is the intended precedence
  doc.match('sept$').tag('TextValue', 'sept-alone')
  doc.match('et [sept]').tag('TextValue', 'et-sept')
  // sept trente
  doc.match(`sept ${tens}`).tag('TextValue', 'sept-trente')
  doc.match(`${tens} sept`).tag('TextValue', 'trente-sept')
  // // sept-et-jun
  // doc.match('#Date [et] #Date', 0).tag('Date', 'date-et-date')
  // // courant juin
  // doc.match('(en|entre|depuis|courant|pendant|dans|lorsque|avant|après) #Date').tag('Date', 'depuis-date')
  // // jusque'en juin
  // doc.match('jusque (en|à) #Date').tag('Date', 'jusque-date')
  // // au cours de juin
  // doc.match('au cours de #Date').tag('Date', 'au-cours-de-date')
}
export default postTagger