├── src
├── _version.js
├── 03-three
│ ├── numbers
│ │ ├── plugin.js
│ │ ├── find.js
│ │ ├── parse
│ │ │ ├── _data.js
│ │ │ ├── index.js
│ │ │ └── fromText.js
│ │ ├── format
│ │ │ ├── index.js
│ │ │ └── toText.js
│ │ └── data.js
│ ├── topics
│ │ ├── plugin.js
│ │ └── api.js
│ ├── contractions
│ │ ├── plugin.js
│ │ └── api.js
│ ├── nouns
│ │ ├── plugin.js
│ │ └── api.js
│ ├── adjectives
│ │ ├── plugin.js
│ │ └── api.js
│ └── verbs
│ │ ├── plugin.js
│ │ └── api
│ │ ├── adverbs.js
│ │ ├── parse.js
│ │ ├── toJSON.js
│ │ └── find.js
├── 02-two
│ ├── preTagger
│ │ ├── methods
│ │ │ ├── index.js
│ │ │ └── guessGender.js
│ │ ├── compute
│ │ │ ├── 2nd-pass
│ │ │ │ ├── noun-fallback.js
│ │ │ │ ├── neighbours.js
│ │ │ │ ├── suffix-lookup.js
│ │ │ │ └── acronym.js
│ │ │ ├── 3rd-pass
│ │ │ │ ├── fix-contractions.js
│ │ │ │ ├── adj-plurals.js
│ │ │ │ ├── number-types.js
│ │ │ │ ├── noun-gender.js
│ │ │ │ ├── noun-plurals.js
│ │ │ │ ├── adj-gender.js
│ │ │ │ ├── verb-tense.js
│ │ │ │ └── verb-form.js
│ │ │ ├── 1st-pass
│ │ │ │ ├── titlecase.js
│ │ │ │ ├── regex.js
│ │ │ │ └── year.js
│ │ │ └── index.js
│ │ ├── model
│ │ │ ├── index.js
│ │ │ ├── regex
│ │ │ │ ├── regex-text.js
│ │ │ │ ├── regex-normal.js
│ │ │ │ └── regex-numbers.js
│ │ │ └── suffixes.js
│ │ ├── plugin.js
│ │ └── tagRank.js
│ ├── postTagger
│ │ ├── plugin.js
│ │ └── matches.js
│ └── tagset
│ │ ├── plugin.js
│ │ └── tags
│ │ ├── values.js
│ │ ├── dates.js
│ │ ├── misc.js
│ │ ├── nouns.js
│ │ └── verbs.js
├── _lib.js
├── 01-one
│ ├── lexicon
│ │ ├── methods
│ │ │ ├── index.js
│ │ │ ├── model.js
│ │ │ ├── noun
│ │ │ │ └── index.js
│ │ │ ├── adjective
│ │ │ │ └── index.js
│ │ │ └── verb
│ │ │ │ └── index.js
│ │ ├── plugin.js
│ │ ├── model
│ │ │ ├── misc.js
│ │ │ └── lexicon.js
│ │ └── compute
│ │ │ └── root.js
│ └── tokenize
│ │ ├── plugin.js
│ │ ├── compute
│ │ ├── index.js
│ │ └── machine.js
│ │ ├── contractions.js
│ │ └── unicode.js
└── index.js
├── data
├── lexicon
│ ├── misc
│ │ ├── determiners.js
│ │ ├── conjunctions.js
│ │ ├── prepositions.js
│ │ ├── expressions.js
│ │ ├── currencies.js
│ │ └── adverbs.js
│ ├── dates
│ │ ├── dates.js
│ │ ├── weekdays.js
│ │ └── months.js
│ ├── nouns
│ │ ├── feminine.js
│ │ ├── pronouns.js
│ │ ├── possessives.js
│ │ ├── uncountables.js
│ │ ├── masculine.js
│ │ └── sportsTeams.js
│ ├── numbers
│ │ ├── ordinals.js
│ │ ├── cardinals.js
│ │ └── units.js
│ ├── people
│ │ ├── firstnames.js
│ │ ├── honorifics.js
│ │ └── people.js
│ ├── misc.js
│ ├── places
│ │ ├── places.js
│ │ └── regions.js
│ └── index.js
└── models
│ ├── _lint.js
│ └── index.js
├── plugins
└── dates
│ ├── src
│ ├── phrase
│ │ ├── date
│ │ │ ├── 01-date.js
│ │ │ ├── 02-year.js
│ │ │ ├── 03-misc.js
│ │ │ ├── data.js
│ │ │ ├── units.js
│ │ │ └── index.js
│ │ ├── normalize.js
│ │ └── index.js
│ ├── plugin.js
│ ├── toJson.js
│ ├── find.js
│ └── api.js
│ ├── tests
│ ├── _lib.js
│ ├── backburner
│ │ ├── ambig-weekday.ignore.js
│ │ ├── equals.ignore.js
│ │ ├── to-iso.ignore.js
│ │ └── ambig-month.ignore.js
│ └── dates.test.js
│ ├── README.md
│ ├── rollup.config.js
│ ├── index.d.ts
│ ├── scratch.js
│ └── package.json
├── .gitignore
├── tmp.js
├── learn
├── giga
│ ├── makeModel.js
│ ├── french.js
│ ├── _giga.js
│ ├── getList.js
│ ├── getPairs.js
│ ├── corpus.js
│ └── test.js
├── adjectives
│ └── learn.js
├── wiktionary
│ ├── add.js
│ └── index.js
├── wikinews
│ ├── packSuffixes.js
│ ├── parse.js
│ ├── getLexicon.js
│ └── getSuffix.js
├── nouns
│ └── learn.js
├── wolf
│ └── parse_wolf.js
└── verbs
│ ├── old.js
│ ├── toPairs.js
│ ├── single-pairs.js
│ └── learn.js
├── scripts
├── version.js
├── stress.js
├── cleanup.js
├── pack.js
└── types.ts
├── tests
├── _lib.js
├── buildNet.test.js
├── conjugate.test.js
└── numbers
│ ├── ordinal.test.js
│ └── number-misc.test.js
├── add-verbs.js
├── rollup.config.js
├── .esformatter
├── changelog.md
├── LICENSE
├── .github
└── workflows
│ └── build-and-test.yml
├── .eslintrc
├── package.json
├── types
├── view
│ └── fr.ts
├── index.d.ts
└── misc.ts
└── scratch.js
/src/_version.js:
--------------------------------------------------------------------------------
1 | export default '0.2.8'
--------------------------------------------------------------------------------
/src/03-three/numbers/plugin.js:
--------------------------------------------------------------------------------
1 | import api from './api.js'
2 |
3 | export default {
4 | api
5 | }
--------------------------------------------------------------------------------
/src/03-three/topics/plugin.js:
--------------------------------------------------------------------------------
1 | import api from './api.js'
2 |
3 | export default {
4 | api
5 | }
--------------------------------------------------------------------------------
/data/lexicon/misc/determiners.js:
--------------------------------------------------------------------------------
1 | export default ['le', 'la', 'les', 'au', 'aux', 'ol', 'un', 'une']
2 |
--------------------------------------------------------------------------------
/plugins/dates/src/phrase/date/01-date.js:
--------------------------------------------------------------------------------
1 | const parse = function () {
2 |
3 | }
4 | export default parse
--------------------------------------------------------------------------------
/plugins/dates/src/phrase/date/02-year.js:
--------------------------------------------------------------------------------
1 | const parse = function () {
2 |
3 | }
4 | export default parse
--------------------------------------------------------------------------------
/plugins/dates/src/phrase/date/03-misc.js:
--------------------------------------------------------------------------------
1 | const parse = function () {
2 |
3 | }
4 | export default parse
--------------------------------------------------------------------------------
/src/03-three/contractions/plugin.js:
--------------------------------------------------------------------------------
1 | import api from './api.js'
2 |
3 | export default {
4 | api
5 | }
--------------------------------------------------------------------------------
/src/03-three/nouns/plugin.js:
--------------------------------------------------------------------------------
1 | import api from './api.js'
2 |
3 | export default {
4 | api,
5 | }
6 |
--------------------------------------------------------------------------------
/src/03-three/adjectives/plugin.js:
--------------------------------------------------------------------------------
1 | import api from './api.js'
2 |
3 | export default {
4 | api,
5 | }
6 |
--------------------------------------------------------------------------------
/src/03-three/verbs/plugin.js:
--------------------------------------------------------------------------------
1 | import api from './api/api.js'
2 |
3 | export default {
4 | api,
5 | }
6 |
--------------------------------------------------------------------------------
/data/lexicon/dates/dates.js:
--------------------------------------------------------------------------------
1 | // uncontroversial date words
2 | export default ['aujourd\'hui', 'demain', 'hier', 'weekend']
3 |
--------------------------------------------------------------------------------
/data/lexicon/misc/conjunctions.js:
--------------------------------------------------------------------------------
1 | export default ['et', 'mais', 'soit', 'puis', 'car', 'voire', 'sinon', 'comme', 'donc']
2 |
--------------------------------------------------------------------------------
/data/lexicon/nouns/feminine.js:
--------------------------------------------------------------------------------
1 | export default ['confiture', 'géologie', 'librairie', 'ambulance', 'poule', 'rue', 'lutte']
2 |
--------------------------------------------------------------------------------
/src/02-two/preTagger/methods/index.js:
--------------------------------------------------------------------------------
1 | import guessGender from './guessGender.js'
2 | export default { one: { guessGender } }
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | build/
3 | .DS_Store
4 | coverage
5 | wolf-1.0b4.xml
6 | wikinews.txt
7 | /learn/giga/results/*.js
8 | learn/scrape/*
--------------------------------------------------------------------------------
/src/02-two/postTagger/plugin.js:
--------------------------------------------------------------------------------
1 | import postTagger from './matches.js'
2 |
3 | export default {
4 | compute: {
5 | postTagger
6 | },
7 | hooks: ['postTagger']
8 | }
--------------------------------------------------------------------------------
/src/_lib.js:
--------------------------------------------------------------------------------
1 | // console.log('local-path')
2 | // import nlp from '/Users/spencer/mountain/compromise/src/one.js'
3 | import nlp from 'compromise/one'
4 | export default nlp
--------------------------------------------------------------------------------
/src/01-one/lexicon/methods/index.js:
--------------------------------------------------------------------------------
1 | import adjective from './adjective/index.js'
2 | import noun from './noun/index.js'
3 | import verb from './verb/index.js'
4 |
5 | export default { adjective, noun, verb }
6 |
--------------------------------------------------------------------------------
/tmp.js:
--------------------------------------------------------------------------------
1 | import verbs from './data/models/verb/present-tense.js'
2 | import lex from './data/lexicon/index.js'
3 | Object.keys(verbs).forEach(k => {
4 | if (!lex[k]) {
5 | console.log(k)
6 | }
7 | })
8 |
9 |
--------------------------------------------------------------------------------
/data/lexicon/nouns/pronouns.js:
--------------------------------------------------------------------------------
1 | // are these right?
2 | export default [
3 | 'il',
4 | 'c',
5 | 'elle',
6 | 'on',
7 | 'ils',
8 | 'nous',
9 | 'je',
10 | 'ce',
11 | 'j',
12 | 'elles',
13 | 'vous',
14 | 'tu',
15 | 't',
16 | 'moi',
17 | ]
18 |
--------------------------------------------------------------------------------
/learn/giga/makeModel.js:
--------------------------------------------------------------------------------
1 | import data from './results/plural-sing.js'
2 | import { learn, compress, test, validate } from 'suffix-thumb'
3 |
4 | const pairs = validate(data)
5 | test(pairs)
6 | const model = learn(pairs)
7 | console.log(JSON.stringify(model, null, 2))
8 |
9 |
--------------------------------------------------------------------------------
/data/models/_lint.js:
--------------------------------------------------------------------------------
1 | import model from './verb/present-tense.js'
2 |
3 | Object.keys(model).forEach(k => {
4 | let s = new Set()
5 | model[k].slice(1).forEach(str => {
6 | if (s.has(str)) {
7 | console.log(k, str)
8 | }
9 | s.add(str)
10 | })
11 | })
--------------------------------------------------------------------------------
/scripts/version.js:
--------------------------------------------------------------------------------
1 | import fs from 'fs'
2 | // avoid requiring our whole package.json file
3 | // make a small file for our version number
4 | let pkg = JSON.parse(fs.readFileSync('./package.json').toString())
5 |
6 | fs.writeFileSync('./src/_version.js', `export default '${pkg.version}'`)
7 |
--------------------------------------------------------------------------------
/tests/_lib.js:
--------------------------------------------------------------------------------
1 | /* eslint-disable no-console */
2 | import build from '../builds/fr-compromise.mjs'
3 | import src from '../src/index.js'
4 | let nlp = src
5 | if (process.env.TESTENV === 'prod') {
6 | console.warn('== production build test 🚀 ==')
7 | nlp = build
8 | }
9 | export default nlp
10 |
--------------------------------------------------------------------------------
/plugins/dates/src/plugin.js:
--------------------------------------------------------------------------------
1 | import api from './api.js'
2 |
3 | let lexicon = {
4 | heir: 'Date',
5 | soir: 'Date',
6 | nuit: 'Date',
7 | 'soirée': 'Date',
8 | matin: 'Date',
9 | 'après midi': 'Date',
10 | semaine: 'Duration',
11 | }
12 |
13 | export default {
14 | words: lexicon,
15 | api,
16 | }
--------------------------------------------------------------------------------
/src/01-one/lexicon/plugin.js:
--------------------------------------------------------------------------------
1 | import methods from './methods/index.js'
2 | import words from './model/lexicon.js'
3 | import root from './compute/root.js'
4 |
5 | export default {
6 | methods: {
7 | two: {
8 | transform: methods
9 | }
10 | },
11 | words,
12 | compute: {
13 | root: root
14 | }
15 | }
--------------------------------------------------------------------------------
/src/01-one/tokenize/plugin.js:
--------------------------------------------------------------------------------
1 | import unicode from './unicode.js'
2 | import contractions from './contractions.js'
3 | import compute from './compute/index.js'
4 |
5 |
6 | export default {
7 | mutate: (world) => {
8 | world.model.one.unicode = unicode
9 |
10 | world.model.one.contractions = contractions
11 | },
12 | compute
13 | }
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/2nd-pass/noun-fallback.js:
--------------------------------------------------------------------------------
1 | const nounFallback = function (terms, i, world) { // tags terms[i] as a Noun when it has no tags at all
2 | let setTag = world.methods.one.setTag
3 | let term = terms[i]
4 | if (term.tags.size === 0) { // nothing has tagged this term so far
5 | setTag([term], 'Noun', world, false, 'fallback')
6 | return true // a tag was applied
7 | }
8 | return null // term already had at least one tag; left untouched
9 | }
10 | export default nounFallback
--------------------------------------------------------------------------------
/src/02-two/tagset/plugin.js:
--------------------------------------------------------------------------------
1 | import nouns from './tags/nouns.js'
2 | import verbs from './tags/verbs.js'
3 | import values from './tags/values.js'
4 | import dates from './tags/dates.js'
5 | import misc from './tags/misc.js'
6 |
7 | let tags = Object.assign({}, nouns, verbs, values, dates, misc)
8 |
9 | export default {
10 | tags
11 | }
12 |
--------------------------------------------------------------------------------
/src/02-two/preTagger/model/index.js:
--------------------------------------------------------------------------------
1 | import regexNormal from './regex/regex-normal.js'
2 | import regexNumbers from './regex/regex-numbers.js'
3 | import regexText from './regex/regex-text.js'
4 | import suffixPatterns from './suffixes.js'
5 |
6 |
7 | export default {
8 | regexNormal,
9 | regexNumbers,
10 | regexText,
11 | suffixPatterns
12 | }
13 |
--------------------------------------------------------------------------------
/data/lexicon/nouns/possessives.js:
--------------------------------------------------------------------------------
1 | // are these right?
2 | export default ['en', 'lui', 'nous', 'leur', 'm', 'me', 'vous', 'te', 'toi', 'ce',
3 |
4 | 'mon', 'ma', 'mes',// my
5 | 'ton', 'ta', 'tes',// your
6 | 'son', 'sa', 'ses',// his
7 | 'notre', 'notre', 'nos',// our
8 | 'votre', 'votre', 'vos',// your
9 | 'leur', 'leur', 'leurs',// their
10 | ]
11 |
--------------------------------------------------------------------------------
/src/01-one/lexicon/methods/model.js:
--------------------------------------------------------------------------------
1 | import { uncompress } from 'suffix-thumb'
2 | import packed from './_data.js'
3 |
4 | // uncompress them
5 | let model = Object.keys(packed).reduce((h, k) => {
6 | h[k] = {}
7 | Object.keys(packed[k]).forEach(form => {
8 | h[k][form] = uncompress(packed[k][form])
9 | })
10 | return h
11 | }, {})
12 |
13 | export default model
--------------------------------------------------------------------------------
/src/02-two/preTagger/plugin.js:
--------------------------------------------------------------------------------
1 | import preTagger from './compute/index.js'
2 | import tagRank from './tagRank.js'
3 | import model from './model/index.js'
4 | import methods from './methods/index.js'
5 |
6 |
7 | export default {
8 | compute: {
9 | preTagger,
10 | tagRank
11 | },
12 | methods,
13 | model: {
14 | two: model
15 | },
16 | hooks: ['preTagger']
17 | }
--------------------------------------------------------------------------------
/data/lexicon/dates/weekdays.js:
--------------------------------------------------------------------------------
1 | export default [
2 | 'lundi', // - Monday.
3 | 'mardi', // - Tuesday.
4 | 'mercredi', // - Wednesday.
5 | 'jeudi', // - Thursday.
6 | 'vendredi', // - Friday.
7 | 'samedi', // - Saturday.
8 | 'dimanche', // - Sunday.
9 | 'lun', //
10 | 'mar', //
11 | 'mer', //
12 | 'jeu', //
13 | 'ven', //
14 | 'sam', //
15 | 'dim', //
16 | ]
17 |
--------------------------------------------------------------------------------
/plugins/dates/tests/_lib.js:
--------------------------------------------------------------------------------
1 | /* eslint-disable no-console */
2 | import build from '../../../builds/fr-compromise.mjs'
3 | import src from '../../../src/index.js'
4 | let nlp = src
5 | if (process.env.TESTENV === 'prod') {
6 | console.warn('== production build test 🚀 ==')
7 | nlp = build
8 | }
9 |
10 | import plg from '../src/plugin.js'
11 | nlp.plugin(plg)
12 |
13 | export default nlp
14 |
--------------------------------------------------------------------------------
/plugins/dates/src/toJson.js:
--------------------------------------------------------------------------------
1 |
2 | const toJson = function (arr) { // maps each entry of arr to a { start, end } object built from .iso() values
3 | return arr.map(o => {
4 | let res = {
5 | start: o.start.start().iso() // presumably the first moment of the start unit - confirm against the date library
6 | }
7 | // use the explicit end date when one was parsed, otherwise fall back to the end of the start date
8 | if (o.end) {
9 | res.end = o.end.end().iso()
10 | } else {
11 | res.end = o.start.end().iso()
12 | }
13 | return res
14 | })
15 | }
16 | export default toJson
--------------------------------------------------------------------------------
/src/01-one/tokenize/compute/index.js:
--------------------------------------------------------------------------------
1 | import machine from './machine.js'
2 |
3 | // calls fn(term, world) once for every term of every document in the view
4 | const termLoop = function (view, fn) {
5 | let docs = view.docs
6 | for (let i = 0; i < docs.length; i += 1) { // each document
7 | for (let t = 0; t < docs[i].length; t += 1) { // each term inside it
8 | fn(docs[i][t], view.world)
9 | }
10 | }
11 | }
12 | export default {
13 | machine: (view) => termLoop(view, machine), // run the imported `machine` function over every term
14 | }
--------------------------------------------------------------------------------
/learn/adjectives/learn.js:
--------------------------------------------------------------------------------
1 | import data from './data.js'
2 | // import data from '../nouns/data.js'
3 |
4 | import { learn, compress, test } from 'suffix-thumb'
5 |
6 |
7 | const pairs = {}
8 | data.forEach(a => {
9 | let [m, f, mp, fp] = a
10 | pairs[m] = [f, mp, fp]
11 | })
12 |
13 | console.log(JSON.stringify(pairs, null, 2))
14 | // let model = learn(pairs)
15 | // model = compress(model)
16 | // console.log(JSON.stringify(model, null, 2))
17 | // test(pairs)
--------------------------------------------------------------------------------
/plugins/dates/src/phrase/normalize.js:
--------------------------------------------------------------------------------
1 | const normalize = function (m) { // returns a simplified clone of a date match, ready for parsing
2 | m = m.clone() // all edits below are made on the clone
3 | // drop a redundant day-name when a month and a numeric day are both present - 'Wed march 2nd'
4 | if (m.has('#WeekDay') && m.has('#Month') && m.has('#NumericValue')) {
5 | m.remove('#WeekDay')
6 | }
7 | // drop the article - jusqu'à le quatorze juillet
8 | m.remove('(le|la)')
9 | // turn written numbers into digits - quatorze -> 14
10 | m.numbers().toCardinal().toNumber()
11 | // m.compute('index')
12 | return m
13 | }
14 | export default normalize
--------------------------------------------------------------------------------
/data/lexicon/numbers/ordinals.js:
--------------------------------------------------------------------------------
1 | export default [
2 | 'zeroième',
3 | 'premier',
4 | 'unième',
5 | 'deuxième',
6 | 'troisième',
7 | 'quatrième',
8 | 'cinquième',
9 | 'sixième',
10 | 'septième',
11 | 'huitième',
12 | 'neuvième',
13 | 'dixième',
14 | 'onzième',
15 | 'douzième',
16 | 'treizième',
17 | 'quatorzième',
18 | 'quinzième',
19 | 'seizième',
20 | 'vingtième',
21 | 'trentième',
22 | 'quarantième',
23 | 'cinquantième',
24 | 'soixantième',
25 | ]
26 |
--------------------------------------------------------------------------------
/src/03-three/numbers/find.js:
--------------------------------------------------------------------------------
1 | const findNumbers = function (view) { // finds runs of #Value terms and splits them into individual numbers
2 | let m = view.match('#Value+')
3 |
4 | // two written ordinals in a row are separate numbers - 'seventh fifth'
5 | if (m.match('#Ordinal #Ordinal').match('#TextValue').found && !m.has('#Multiple')) {
6 | m = m.splitAfter('#Ordinal')
7 | }
8 |
9 | // split before a cardinal that follows an ordinal - 'fifth five'
10 | m = m.splitBefore('#Ordinal [#Cardinal]', 0)
11 | // a number-range ends its number - '5-8'
12 | m = m.splitAfter('#NumberRange')
13 | // a year starts its own number - 'june 5th 1999'
14 | m = m.splitBefore('#Year')
15 | return m
16 | }
17 | export default findNumbers
--------------------------------------------------------------------------------
/src/01-one/lexicon/methods/noun/index.js:
--------------------------------------------------------------------------------
1 | import { convert, reverse } from 'suffix-thumb'
2 | import model from '../model.js'
3 |
4 | let pRev = reverse(model.noun.plural) // reversed plural model, used by fromPlural below
5 | const toPlural = (str) => convert(str, model.noun.plural) // singular -> plural
6 | const fromPlural = (str) => convert(str, pRev) // plural -> singular
7 |
8 | const all = (str) => { // every distinct form of the noun: [str] or [str, plural]
9 | let plr = toPlural(str)
10 | if (str === plr) { // plural is spelled the same; don't list it twice
11 | return [str]
12 | }
13 | return [str, plr]
14 | }
15 | export default {
16 | toPlural,
17 | fromPlural,
18 | all
19 | }
--------------------------------------------------------------------------------
/plugins/dates/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | travaux en cours! • work-in-progress!
5 |
6 |
7 |
8 | ```js
9 | import nlp from 'fr-compromise'
10 | import frDatePlugin from 'fr-compromise-dates'
11 | nlp.plugin(frDatePlugin)
12 |
13 | let doc = nlp('entre sept et oct')
14 | doc.dates().json()[0]
15 | /*
16 | { text: 'entre sept et oct',
17 | date: [{
18 | start: { month: 9, year: 2023 },
19 | end: { month: 10, year: 2023 }
20 | }]
21 | }*/
22 | ```
23 |
24 | MIT
--------------------------------------------------------------------------------
/learn/wiktionary/add.js:
--------------------------------------------------------------------------------
1 | import fixes from './fixes.js'
2 | import adj from '../../data/models/adjective/index.js'
3 |
4 | let data = adj
5 | //m: [f, p, fp]
6 | let out = {}
7 | Object.keys(fixes).forEach(k => {
8 | let arr = fixes[k]
9 | if (arr.length === 1) {
10 | // only got a plural
11 | out[k] = [k, arr[0], arr[0]]
12 | } else if (arr.length === 3) {
13 | // only fem plurals
14 | let [m, f, fp] = arr
15 | out[k] = [f, m, fp]
16 | }
17 | })
18 | data = Object.assign(data, out)
19 | console.log(JSON.stringify(data, null, 2))
--------------------------------------------------------------------------------
/src/02-two/preTagger/model/regex/regex-text.js:
--------------------------------------------------------------------------------
1 | export default [
2 | // #coolguy
3 | [/^#[a-z0-9_\u00C0-\u00FF]{2,}$/i, 'HashTag'],
4 |
5 | // @spencermountain
6 | [/^@\w{2,}$/, 'AtMention'],
7 |
8 | // period-ones acronyms - f.b.i.
9 | [/^([A-Z]\.){2}[A-Z]?/i, ['Acronym', 'Noun'], 'F.B.I'], //ascii-only
10 |
11 | // ending-apostrophes
12 | [/.{3}[lkmnp]in['‘’‛‵′`´]$/, 'Gerund', "chillin'"],
13 | [/.{4}s['‘’‛‵′`´]$/, 'Possessive', "flanders'"],
14 |
15 | // leading contractions
16 | // [/^s'[a-z]$/, 'Verb'],
17 | // [/^l'[a-z]$/, 'Noun'],
18 | ]
19 |
--------------------------------------------------------------------------------
/data/lexicon/numbers/cardinals.js:
--------------------------------------------------------------------------------
1 | export default [
2 | 'zero', // - 0
3 | 'un', // - 1
4 | 'deux', // - 2
5 | 'trois', // - 3
6 | 'quatre', // - 4
7 | 'cinq', // - 5
8 | 'six', // - 6
9 | 'sept', // - 7
10 | 'huit', // - 8
11 | 'neuf', // - 9
12 |
13 | 'dix',
14 | 'onze',
15 | 'douze',
16 | 'treize',
17 | 'quatorze',
18 | 'quinze',
19 | 'seize',
20 | 'dix sept',
21 | 'dix huit',
22 | 'dix neuf',
23 | 'vingt',
24 | 'trente',
25 | 'quarante',
26 | 'cinquante',
27 | 'soixante',
28 | // 'quatre vingt',
29 | // 'quatre vingt dix huit',
30 |
31 | ]
32 |
--------------------------------------------------------------------------------
/plugins/dates/src/find.js:
--------------------------------------------------------------------------------
1 | const findDates = function (doc) { // finds #Date phrases, widened to include nearby numbers and lead-in words
2 | let m = doc.match('#Date+')
3 | // pull in values directly before or after the date - 7 jun 2018
4 | m = m.growLeft('#Value+$')
5 | m = m.growRight('^#Value+')
6 | // pull in a leading article, then a leading preposition - pendant juin
7 | m = m.growLeft('(le|la)$')// jusqu'à le
8 | m = m.growLeft('(en|entre|depuis|courant|pendant|dans|lorsque|avant|après|à|a|au)$')
9 | m = m.growLeft('au cours de$')
10 | m = m.growLeft('jusque$')// jusqu'en jusqu'à
11 | // join a second date that follows 'et' - sept et juin
12 | m = m.growRight('^et (le|la)? #Date+')
13 |
14 | // remove overlaps
15 | m = m.settle()
16 | // m.debug()
17 | return m
18 | }
19 | export default findDates
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/3rd-pass/fix-contractions.js:
--------------------------------------------------------------------------------
1 | // refine the implicit article of a contracted l' (le/la/les) using the tags of the word after it
2 | const fixContractions = function (terms, i) {
3 | let term = terms[i]
4 | // let tags = term.tags
5 | if (term.implicit === 'le') { // only terms whose implicit form is currently 'le'
6 | let nextTerm = terms[i + 1]
7 | if (!nextTerm) { // last term - there is no following word to inspect
8 | return null
9 | }
10 | if (nextTerm.tags.has('FemaleNoun')) {
11 | term.implicit = 'la'
12 | }
13 | // the plural check runs last, so it overrides the feminine guess (no separate feminine-plural form yet)
14 | if (nextTerm.tags.has('PluralNoun')) {
15 | term.implicit = 'les'
16 | }
17 | }
18 | return null // always null; the term is edited in place
19 | }
20 | export default fixContractions
--------------------------------------------------------------------------------
/add-verbs.js:
--------------------------------------------------------------------------------
1 | import prettyJSON from 'pretty-json-stringify'
2 |
3 | import fs from 'fs'
4 | // parse JSON-newline file
5 | let arr = fs.readFileSync('./more-verbs.jsonl').toString()
6 | .split(/\n/).filter(str => str).map(str => JSON.parse(str))
7 |
8 | let out = {}
9 | arr.forEach(obj => {
10 | if (obj['Indicatif Futur'][0]) {
11 | let str = obj['Indicatif Futur']
12 | out[obj.word] = str
13 | }
14 | })
15 | console.log(prettyJSON(out, {
16 | shouldExpand: (_, level) => level >= 1 ? false : true
17 | }))
18 |
19 | import nlp from './src/index.js'
20 | // console.log(nlp('dépister').verbs().conjugate())
21 |
22 |
--------------------------------------------------------------------------------
/data/lexicon/people/firstnames.js:
--------------------------------------------------------------------------------
1 | //ambiguously-gendered firstnames
2 | //names commonly used in either gender
3 | export default [
4 | 'alexis',
5 | 'andra',
6 | 'aubrey',
7 | 'blair',
8 | 'casey',
9 | 'cassidy',
10 | 'cheyenne',
11 | 'devan',
12 | 'devon',
13 | 'jamie',
14 | 'jammie',
15 | 'jessie',
16 | 'jude',
17 | 'kasey',
18 | 'kelsey',
19 | 'kenyatta',
20 | 'kerry',
21 | 'kris',
22 | 'lashawn',
23 | 'marion',
24 | 'marlo',
25 | 'mel',
26 | 'morgan',
27 | 'nelly',
28 | 'quinn',
29 | 'regan',
30 | 'rene',
31 | 'shay',
32 | 'shea',
33 | 'shelby',
34 | 'shiloh',
35 | ]
36 |
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/1st-pass/titlecase.js:
--------------------------------------------------------------------------------
1 | const isTitleCase = function (str) {
2 | return /^[A-Z][a-z'\u00C0-\u00FF]/.test(str) || /^[A-Z]$/.test(str)
3 | }
4 |
5 | // add a noun to any non-0 index titlecased word, with no existing tag
6 | const titleCaseNoun = function (terms, i, world) {
7 | let setTag = world.methods.one.setTag
8 | let term = terms[i]
9 | if (i === 0) {
10 | return null
11 | }
12 | if (term.tags.size > 0) {
13 | return null
14 | }
15 | if (isTitleCase(term.text)) {
16 | setTag([term], 'ProperNoun', world, false, 'title-case')
17 | return true
18 | }
19 | return null
20 | }
21 | export default titleCaseNoun
--------------------------------------------------------------------------------
/tests/buildNet.test.js:
--------------------------------------------------------------------------------
1 | import test from 'tape'
2 | import nlp from './_lib.js'
3 | let here = '[fr-buildNet] '
4 |
5 | test('buildNet:', function (t) {
6 | let matches = [
7 | { match: '{crier/Verb}' },
8 | { match: '{jaune/Adjective}' },
9 | { match: '{troupe/Noun}' }
10 | ]
11 | let net = nlp.buildNet(matches)
12 | t.ok(net.hooks.crier, here + 'crier')
13 | t.ok(net.hooks.criaient, here + 'criaient')
14 | t.ok(net.hooks.criaient, here + 'criaient')
15 | t.ok(net.hooks.jaune, here + 'jaune')
16 | t.ok(net.hooks.jaunes, here + 'jaunes')
17 | t.ok(net.hooks.troupe, here + 'troupe')
18 | t.ok(net.hooks.troupes, here + 'troupes')
19 | t.end()
20 | })
--------------------------------------------------------------------------------
/scripts/stress.js:
--------------------------------------------------------------------------------
1 | /* eslint-disable no-console, no-unused-vars */
2 | import corpus from 'fr-corpus' //install with `npm i fr-corpus --no-save`
3 | import nlp from '../src/index.js'
4 | let texts = corpus.all()
5 | console.log(`\n\n--- running compromise on ${texts.length.toLocaleString()} random sentences---\n`)
6 | console.log(' --should take a few minutes--')
7 |
8 | for (let i = 0; i < texts.length; i++) {
9 | let txt = texts[i][0]
10 | let doc = nlp(txt)
11 | let m = doc.match('#Determiner #Adverb #Adjective #Noun')
12 | m.forEach(d => {
13 | d.terms()
14 | })
15 | m.verbs().conjugate()
16 | doc.numbers().add(2)
17 | }
18 |
19 | console.log('\n\n - done!')
20 |
--------------------------------------------------------------------------------
/data/lexicon/dates/months.js:
--------------------------------------------------------------------------------
1 | export default [
2 | 'janvier', // - January
3 | 'février', // - February
4 | 'mars', // - March
5 | 'avril', // - April
6 | 'mai', // - May
7 | 'juin', // - June
8 | 'juillet', // - July
9 | 'aout', // - August
10 | 'septembre', // -September
11 | 'octobre', // - October
12 | 'novembre', // - November
13 | 'décembre', // - December
14 | 'fevrier',
15 | 'decembre',
16 |
17 | 'janv',
18 | 'jan',
19 | 'fév',
20 | 'fev',
21 | 'févr',
22 | 'fevr',
23 | 'mars',
24 | 'avr',
25 | 'mai',
26 | 'juin',
27 | 'juil',
28 | 'juill',
29 | 'aout',
30 | 'sept',
31 | 'oct',
32 | 'nov',
33 | 'déc',
34 | 'dec',
35 | ]
36 |
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/3rd-pass/adj-plurals.js:
--------------------------------------------------------------------------------
1 | // guess a plural tag for each Adjective, from the way the word ends
2 | const adjPlurals = function (terms, i, world) {
3 | let setTag = world.methods.one.setTag
4 | let term = terms[i]
5 | let tags = term.tags
6 | let str = term.implicit || term.normal || term.text || '' // first non-empty form of the word
7 | if (tags.has('Adjective')) {
8 | if (str.endsWith('s') || str.endsWith('aux')) { // endings treated as plural
9 | return setTag([term], 'PluralAdjective', world, false, '3-plural-adj')
10 | }
11 | // if (str.endsWith('euse')) {
12 | // return setTag([term], 'SingularAdjective', world, false, '3-plural-adj')
13 | // }
14 | }
15 | return null // not an adjective, or no plural-looking ending
16 | }
17 | export default adjPlurals
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/3rd-pass/number-types.js:
--------------------------------------------------------------------------------
/**
 * Tag year-like numbers as a Year.
 * Only terms already tagged both Cardinal and NumericValue are considered,
 * and only whole numbers strictly between 1600 and 2090 qualify.
 * Returns whatever setTag returns, or null when no tag is applied.
 */
const numberTags = function (terms, i, world) {
  const setTag = world.methods.one.setTag
  const term = terms[i]
  const { tags } = term
  if (!tags.has('Cardinal') || !tags.has('NumericValue')) {
    return null
  }
  // Number() yields NaN for anything that is not a plain numeric string
  const n = Number(term.text)
  if (Number.isInteger(n) && n > 1600 && n < 2090) {
    return setTag([term], 'Year', world, false, '3-year')
  }
  return null
}
export default numberTags
--------------------------------------------------------------------------------
/src/03-three/numbers/parse/_data.js:
--------------------------------------------------------------------------------
1 | import data from '../data.js'
2 |
// lookup tables, filled once from the shared number data
const toCardinal = {}
const toOrdinal = {}
const toNumber = {}

// every row is destructured as [number, cardinal word, ordinal word]
for (const rows of Object.values(data)) {
  for (const [num, cardinal, ordinal] of rows) {
    toCardinal[ordinal] = cardinal
    toOrdinal[cardinal] = ordinal
    toNumber[cardinal] = num
    // also index the ordinal with its (first) 'è' un-accented
    toNumber[ordinal.replace(/è/, 'e')] = num
  }
}

// plural multiples that are not produced by the rows above
toNumber.cents = 100
toNumber.milles = 1000
toNumber.millions = 1000000
toNumber.milliards = 1000000000

export {
  toOrdinal,
  toCardinal,
  toNumber
}
--------------------------------------------------------------------------------
/src/03-three/verbs/api/adverbs.js:
--------------------------------------------------------------------------------
/**
 * Sort the adverbs of a verb-phrase into those before and those after the root verb.
 * Returns `{ pre, post }`; a side with no adverbs holds `vb.none()`.
 */
const getAdverbs = function (vb, root) {
  const pre = vb.none()
  const post = vb.none()
  if (!vb.has('#Adverb')) {
    return { pre, post }
  }
  // cut the phrase around the main verb
  const chunks = vb.splitOn(root)
  const adverbsAt = n => chunks.eq(n).adverbs()
  // [before, root, after]
  if (chunks.length === 3) {
    return { pre: adverbsAt(0), post: adverbsAt(2) }
  }
  // otherwise the root is either the first chunk, or the second
  const rootIsFirst = chunks.eq(0).isDoc(root)
  return rootIsFirst ? { pre, post: adverbsAt(1) } : { pre: adverbsAt(0), post }
}
export default getAdverbs
27 |
--------------------------------------------------------------------------------
/scripts/cleanup.js:
--------------------------------------------------------------------------------
1 | import keep from '../data/lexicon/nouns/nouns.js'
2 | import og from '../data/lexicon/data/neutralNouns.js'
3 |
4 | // import messy from '../data/lexicon/verbs.js'
5 | // const unique = function (arr) {
6 | // let obj = {}
7 | // for (let i = 0; i < arr.length; i += 1) {
8 | // obj[arr[i]] = true
9 | // }
10 | // return Object.keys(obj)
11 | // }
12 |
13 | // console.log(JSON.stringify(unique(messy), null, 2))
14 |
15 |
16 | let loose = og.filter(str => {
17 | let found = keep.find(s => s === str)
18 | if (found) {
19 | console.log(str)
20 | return false
21 | }
22 | return true
23 | })
24 |
25 | console.log(og.length)
26 | console.log(loose.length)
27 | // console.log(JSON.stringify(loose, null, 2))
--------------------------------------------------------------------------------
/src/02-two/tagset/tags/values.js:
--------------------------------------------------------------------------------
// Tag definitions for number-like values.
// Each entry may name a parent tag (`is`) and a list of tags it excludes (`not`).
// NOTE(review): the exact meaning of `is`/`not` is decided by the tagset consumer, which is not shown here.
export default {
  // the base tag for the entries below
  Value: {
    not: ['Verb', 'Adjective', 'Adverb'],
  },
  // Ordinal and Cardinal exclude each other
  Ordinal: {
    is: 'Value',
    not: ['Cardinal'],
  },
  Cardinal: {
    is: 'Value',
    not: ['Ordinal'],
  },
  Fraction: {
    is: 'Value',
    not: ['Noun'],
  },
  Multiple: {
    is: 'TextValue',
  },
  RomanNumeral: {
    is: 'Cardinal',
    not: ['TextValue'],
  },
  // TextValue and NumericValue exclude each other
  TextValue: {
    is: 'Value',
    not: ['NumericValue'],
  },
  NumericValue: {
    is: 'Value',
    not: ['TextValue'],
  },
  Money: {
    is: 'Cardinal',
  },
  Percent: {
    is: 'Value',
  },
}
39 |
--------------------------------------------------------------------------------
/learn/wikinews/packSuffixes.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs')
2 | const suff = require('../../src/tagger/data/suffixMap.js')
3 |
// Drop longer suffixes that a shorter suffix already covers with the same value.
const shorter = suff[5]
const shortEndings = Object.keys(shorter)
let count = 0

for (let len = 6; len <= 6; len += 1) {
  for (const ending of shortEndings) {
    // re-read the keys for every ending, since entries get deleted as we go
    for (const word of Object.keys(suff[len])) {
      const isCovered = word.endsWith(ending) && shorter[ending] === suff[len][word]
      if (isCovered) {
        count += 1
        console.log('kill:', word, `(${ending})`)
        delete suff[len][word]
      }
    }
  }
}

// print the pruned suffix map
console.log(JSON.stringify(suff, null, 2))
26 |
--------------------------------------------------------------------------------
/learn/giga/french.js:
--------------------------------------------------------------------------------
1 | import { streamXml } from './_giga.js'
const gigaFr = '/Users/spencer/data/opus/fr/giga-fren/xml/fr/giga-fren.release2.fixed.'

/**
 * Stream one numbered giga-fren xml file, handing every item to `doBoth`
 * as `{ fr: [...] }`.
 * Returns a Promise that resolves when the stream reports its end, and
 * rejects if starting the stream throws.
 */
const parseXml = function (id, doBoth) {
  // per-item handler: a failure in doBoth is logged, not re-thrown
  const onItem = item => {
    try {
      doBoth({ fr: item.w || [] })
      return true
    } catch (e) {
      console.log(e)
    }
  }

  return new Promise((resolve, reject) => {
    const onEnd = () => {
      console.log('--done-- ')
      resolve()
    }
    try {
      streamXml(gigaFr + `${id}.xml`, onItem, onEnd)
    } catch (e) {
      console.log(e)
      reject(e)
    }
  })
}

export default parseXml
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/3rd-pass/noun-gender.js:
--------------------------------------------------------------------------------
// noun-like tags that are left without a guessed gender
const noGender = ['ProperNoun', 'Pronoun', 'Possessive']

/**
 * Tag a gender on the Noun at index `i`, when it has none yet.
 * The tag itself comes from the `guessGender` method.
 * Returns whatever setTag returns, or null when no tag is applied.
 */
const nounGender = function (terms, i, world) {
  const { setTag, guessGender } = world.methods.one
  const term = terms[i]
  const { tags } = term
  const needsGender = tags.has('Noun') && !tags.has('MaleNoun') && !tags.has('FemaleNoun')
  if (!needsGender || noGender.some(tag => tags.has(tag))) {
    return null
  }
  const gender = guessGender(terms, i)
  return gender ? setTag([term], gender, world, false, '3-noun-gender') : null
}
export default nounGender
--------------------------------------------------------------------------------
/plugins/dates/rollup.config.js:
--------------------------------------------------------------------------------
1 | import terser from '@rollup/plugin-terser'
2 | import { nodeResolve } from '@rollup/plugin-node-resolve'
3 |
// options handed to terser() for the minified builds
const opts = { keep_classnames: true, module: true }

// three builds of the same entry point (src/plugin.js)
export default [
  // un-minified umd build, written to a .cjs file
  {
    input: 'src/plugin.js',
    output: [{ file: 'builds/fr-compromise-dates.cjs', format: 'umd', name: 'frCompromiseDates' }],
    plugins: [nodeResolve()],
  },
  // minified umd build
  {
    input: 'src/plugin.js',
    output: [{ file: 'builds/fr-compromise-dates.min.js', format: 'umd', name: 'frCompromiseDates' }],
    plugins: [nodeResolve(), terser(opts)],
  },
  // minified es-module build
  {
    input: 'src/plugin.js',
    output: [{ file: 'builds/fr-compromise-dates.mjs', format: 'esm' }],
    plugins: [nodeResolve(), terser(opts)],
  }
]
23 |
--------------------------------------------------------------------------------
/rollup.config.js:
--------------------------------------------------------------------------------
1 | import terser from '@rollup/plugin-terser'
2 | import { nodeResolve } from '@rollup/plugin-node-resolve'
3 |
// options handed to terser() for the minified builds
const opts = {
  keep_classnames: true,
  module: true,
}

// three builds of the same entry point (src/index.js)
export default [
  // === Main ==
  // un-minified umd build, written to a .cjs file
  {
    input: 'src/index.js',
    output: [{ file: 'builds/fr-compromise.cjs', format: 'umd', name: 'frCompromise' }],
    plugins: [nodeResolve()],
  },
  // minified umd build
  {
    input: 'src/index.js',
    output: [{ file: 'builds/fr-compromise.min.js', format: 'umd', name: 'frCompromise' }],
    plugins: [nodeResolve(), terser(opts)],
  },
  // minified es-module build
  {
    input: 'src/index.js',
    output: [{ file: 'builds/fr-compromise.mjs', format: 'esm' }],
    plugins: [nodeResolve(), terser(opts)],
  }

]
28 |
--------------------------------------------------------------------------------
/learn/wikinews/parse.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs')
2 |
// corpus tags that get folded into a shorter tag
const mapping = {
  NPP: 'N',
  NC: 'N',
  U: 'N',
  ET: 'N',

  VINF: 'V',
  VS: 'V',
  VPP: 'PastTense',
  VPR: 'Gerund',
}

// turn one 'word_TAG' token into { word, tag }
const parseToken = function (token) {
  const [word, rawTag] = token.split(/_/)
  const tag = (rawTag || '').trim()
  return {
    word: word.trim(),
    tag: mapping[tag] || tag,
  }
}

// punctuation, and tokens missing a word or a tag, are thrown away
const isUseful = w => w.tag && w.word && w.tag !== 'PONCT'

// one array of { word, tag } per line of the corpus file
const lines = fs
  .readFileSync(__dirname + '/wikinews.txt')
  .toString()
  .split(/\n/)
  .map(str => str.split(/ /g).map(parseToken).filter(isUseful))

module.exports = lines
37 |
--------------------------------------------------------------------------------
/src/01-one/tokenize/compute/machine.js:
--------------------------------------------------------------------------------
const hasDash = /^\p{Letter}+-\p{Letter}+$/u

/**
 * Compute the 'machine' form of a term: a further-normalized string that
 * is not meant to stay human-readable.
 * Sets `term.machine` only when the result differs from `term.normal`.
 */
const doMachine = function (term) {
  const start = term.implicit || term.normal || term.text
  let out = start
    // drop possessive apostrophes
    .replace(/['’]s$/, '')
    .replace(/s['’]$/, 's')
    // lookin' -> looking
    .replace(/([aeiou][ktrp])in'$/, '$1ing')
  // re-enactment -> reenactment
  if (hasDash.test(out)) {
    out = out.replace(/-/g, '')
  }
  // strip a leading # or @
  out = out.replace(/^[#@]/, '')
  if (out !== term.normal) {
    term.machine = out
  }
}
export default doMachine
23 |
--------------------------------------------------------------------------------
/.esformatter:
--------------------------------------------------------------------------------
1 | {
2 | "plugins": [
3 | "esformatter-quotes",
4 | "esformatter-parseint",
5 | "esformatter-braces",
6 | "esformatter-semicolons"
7 | ],
8 | "quotes": {
9 | "type": "single",
10 | "avoidEscape": false
11 | },
12 | "whiteSpace": {
13 | "before": {
14 | "ParameterList": -1,
15 | "ParameterComma": -1,
16 | "FunctionDeclarationOpeningBrace": -1,
17 | "FunctionDeclarationClosingBrace": -1,
18 | "ForStatementExpressionOpening": -1
19 | },
20 | "after": {
21 | "FunctionName": -1,
22 | "ParameterComma": 1,
23 | "FunctionReservedWord": -1,
24 | "ParameterList": -1,
25 | "FunctionDeclarationOpeningBrace": -1,
26 | "PropertyName": -1
27 | }
28 | },
29 | "lineBreak": {
30 | "before": {
31 | "EndOfFile": 1
32 | }
33 | }
34 | }
--------------------------------------------------------------------------------
/learn/nouns/learn.js:
--------------------------------------------------------------------------------
1 | const data = require('./data')
2 | // const toFemme = require('../../src/transforms/nouns/toFemme.js')
3 | const toMasc = require('../../src/transforms/nouns/toMasc.js')
4 | const toSigular = require('../../src/transforms/nouns/toSingular.js')
5 |
// singularize, then masculinize
const toRoot = str => toMasc(toSigular(str))

// inflected form -> wanted root, for every row the transforms get wrong
const irregs = {}
let count = 0

data.forEach(row => {
  const want = row[0]
  const from = row[3]
  const got = toRoot(from)
  if (got === want) {
    count += 1
    return
  }
  irregs[from] = want
  console.log(from + ' ➔ ' + got + ' (' + want + ')')
})

// number of correct rows, then the fraction of correct rows
console.log(count)
console.log(count / data.length)
30 |
--------------------------------------------------------------------------------
/plugins/dates/index.d.ts:
--------------------------------------------------------------------------------
1 | import nlp from 'compromise'
/** the view type returned by calling the compromise library */
type View = ReturnType<typeof nlp>

interface DateView extends View {
  /** convert parsed dates to a date format */
  format(fmt: string): View
  /** get parsed date metadata */
  get(): object[]
}

interface TimeView extends View {
  /** convert parsed dates to a time format */
  format(fmt: string): View
  /** get parsed time metadata */
  get(): object[]
}

/** methods that this plugin adds to every view */
export interface DatesMethods {
  /** match all date-phrases */
  dates(): DateView
  /** match time-of-day phrases */
  times(): TimeView
  /** match lengths of time, like '2 weeks' */
  durations(): View
}

/** the dates plugin, typed with the methods it adds **/
declare const frCompromiseDates: nlp.TypedPlugin<DatesMethods>

export default frCompromiseDates
31 |
--------------------------------------------------------------------------------
/learn/giga/_giga.js:
--------------------------------------------------------------------------------
1 | import XmlStream from 'xml-stream'
2 | import fs from 'fs'
3 |
/**
 * Stream an xml file: `cb(item, xml)` runs at the end of every <s> element
 * (with its <w> children collected), and `end` runs when the stream ends.
 */
const streamXml = function (file, cb, end) {
  const xml = new XmlStream(fs.createReadStream(file))
  xml.collect('w')
  xml.on('endElement: s', item => {
    cb(item, xml)
  })
  xml.on('end', end)
}
13 |
14 |
/**
 * Return the distinct values of `arr` (as strings), most frequent first.
 * Values with the same count keep their property order in the count table.
 */
const topk = function (arr) {
  // prototype-less table, so words like 'constructor' are counted like any other key
  const counts = Object.create(null)
  arr.forEach(a => {
    counts[a] = (counts[a] || 0) + 1
  })
  // a proper three-way comparator: higher counts sort first
  return Object.keys(counts).sort((a, b) => counts[b] - counts[a])
}
25 |
/**
 * Like Array.forEach, but each callback is awaited before the next one starts.
 * The callback receives (value, index, array).
 */
async function forEachSync(array, callback) {
  let index = 0
  while (index < array.length) {
    await callback(array[index], index, array)
    index += 1
  }
}
31 |
32 |
33 | export { streamXml, forEachSync, topk }
--------------------------------------------------------------------------------
/data/lexicon/misc.js:
--------------------------------------------------------------------------------
// Hand-written word -> tag entries for small, frequent words.
export default {
  // negation words
  n: 'Negative',
  ne: 'Negative',
  ni: 'Negative',
  aucun: 'Negative',

  // pronouns tagged as Auxiliary
  se: 'Auxiliary',
  te: 'Auxiliary',
  me: 'Auxiliary',

  // forms of 'avoir'
  ai: 'Auxiliary',
  ont: 'Auxiliary',

  // questions
  ou: 'Conjunction',
  qui: 'Preposition',
  que: 'Preposition',
  a: 'Preposition',
  ces: 'Determiner',
  cette: 'Determiner',


  quelle: 'QuestionWord',
  // que: 'QuestionWord',
  qu: 'QuestionWord',
  quand: 'QuestionWord',

  '&': 'Conjunction',

  si: 'Condition',
  sinon: 'Condition',
  'aujourd\'hui': 'Noun',

  'quelque': 'Adjective',
  'quelques': 'Adjective',

  // alt verbs
  'essaie': 'PresentTense',
  'essaies': 'PresentTense',
  'essaient': 'PresentTense'

}
43 |
--------------------------------------------------------------------------------
/src/02-two/tagset/tags/dates.js:
--------------------------------------------------------------------------------
// Tag definitions for date-like terms.
// Each entry may name a parent tag (`is`), extra tags (`also`) and excluded tags (`not`).
// NOTE(review): the exact meaning of these keys is decided by the tagset consumer, which is not shown here.
// Every `not` is written as an array, so all entries share one shape.
const dates = {
  Date: {
    not: ['Verb', 'Adverb', 'Adjective'],
  },
  Month: {
    is: 'Singular',
    also: ['Date'],
    not: ['Year', 'WeekDay', 'Time'],
  },
  WeekDay: {
    is: 'Noun',
    also: ['Date'],
  },
  Year: {
    is: 'Date',
    not: ['RomanNumeral'],
  },
  FinancialQuarter: {
    is: 'Date',
    not: ['Fraction'],
  },
  // 'easter'
  Holiday: {
    is: 'Date',
    also: ['Noun'],
  },
  // 'summer'
  Season: {
    is: 'Date',
  },
  Timezone: {
    is: 'Noun',
    also: ['Date'],
    not: ['ProperNoun'],
  },
  Time: {
    is: 'Date',
    not: ['AtMention'],
  },
  // 'months'
  Duration: {
    is: 'Noun',
    also: ['Date'],
  },
}
export default dates
46 |
--------------------------------------------------------------------------------
/src/02-two/preTagger/model/regex/regex-normal.js:
--------------------------------------------------------------------------------
1 | export default [
2 | //web tags
3 | [/^[\w.]+@[\w.]+\.[a-z]{2,3}$/, 'Email'],
4 | [/^(https?:\/\/|www\.)+\w+\.[a-z]{2,3}/, 'Url', 'http..'],
5 | [/^[a-z0-9./].+\.(com|net|gov|org|ly|edu|info|biz|dev|ru|jp|de|in|uk|br|io|ai)/, 'Url', '.com'],
6 |
7 | // timezones
8 | [/^[PMCE]ST$/, 'Timezone', 'EST'],
9 |
10 | //names
11 | [/^ma?c'.*/, 'LastName', "mc'neil"],
12 | [/^o'[drlkn].*/, 'LastName', "o'connor"],
13 | [/^ma?cd[aeiou]/, 'LastName', 'mcdonald'],
14 |
15 | //slang things
16 | [/^(lol)+[sz]$/, 'Expression', 'lol'],
17 | [/^wo{2,}a*h?$/, 'Expression', 'wooah'],
18 | [/^(hee?){2,}h?$/, 'Expression', 'hehe'],
19 | [/^(un|de|re)\\-[a-z\u00C0-\u00FF]{2}/, 'Verb', 'un-vite'],
20 |
21 | // m/h
22 | [/^(m|k|cm|km)\/(s|h|hr)$/, 'Unit', '5 k/m'],
23 | // μg/g
24 | [/^(ug|ng|mg)\/(l|m3|ft3)$/, 'Unit', 'ug/L'],
25 | ]
26 |
--------------------------------------------------------------------------------
/learn/wolf/parse_wolf.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | var parser = require('xml2json');
3 | let fs = require('fs');
4 |
// read the whole WOLF wordnet file, and convert it xml -> json
const xml = fs.readFileSync(__dirname + '/wolf-1.0b4.xml', 'utf8');
const json = JSON.parse(parser.toJson(xml));

// collect the literal text of every noun ('n') synset
const words = [];
const synsets = json.WN.SYNSET;
const total = synsets.length;

for (let i = 0; i < total; i++) {
  const literal = synsets[i].SYNONYM.LITERAL;
  if (literal === '_EMPTY_' || synsets[i].POS !== 'n') {
    continue;
  }
  const text = literal['$t'];
  if (text) {
    // a single literal
    words.push(text);
    continue;
  }
  // otherwise, a list of literals
  literal.forEach(function(o) {
    words.push(o['$t']);
  });
}

console.log(JSON.stringify(words, null, 2));
31 |
--------------------------------------------------------------------------------
/data/lexicon/people/honorifics.js:
--------------------------------------------------------------------------------
//extend to person-names if infront of a name - 'Professor Frink'
// each title is listed once; 'duchess' is added next to the older 'dutchess' spelling
const honorifics = [
  'admiral',
  'ayatullah',
  'brigadier',
  'captain',
  'chancellor',
  'colonel',
  'commander',
  'congressman',
  'congresswoman',
  'councillor',
  'count',
  'doctor',
  'duchess',
  'dutchess',
  'excellency',
  'field marshal',
  'first lady',
  'first lieutenant',
  'judge',
  'king',
  'lieutenant',
  'magistrate',
  'marshal',
  'mayor',
  'officer',
  'pastor',
  'president',
  'prime minister',
  'prince',
  'princess',
  'professor',
  'queen',
  'rabbi',
  'rear admiral',
  'reverend',
  'second lieutenant',
  'secretary',
  'sergeant',
  'sultan',
  'taoiseach',
  'vice admiral',
]
export default honorifics
45 |
--------------------------------------------------------------------------------
/src/03-three/adjectives/api.js:
--------------------------------------------------------------------------------
// narrow a view to its n-th result, but only when n is a number
export const getNth = (doc, n) => {
  if (typeof n === 'number') {
    return doc.eq(n)
  }
  return doc
}

// compute, then read, the root form of an adjective
const getRoot = function (m) {
  m.compute('root')
  return m.text('root')
}

/**
 * Adds `.adjectives(n)` to the View prototype.
 * It returns an Adjectives view over every '#Adjective' match (or just the n-th one).
 */
const api = function (View) {
  class Adjectives extends View {
    constructor(document, pointer, groups) {
      super(document, pointer, groups)
      this.viewType = 'Adjectives'
    }
    // conjugate the root of each adjective, via the adjective transform methods
    conjugate(n) {
      const transform = this.methods.two.transform.adjective
      const conjugateOne = m => transform.conjugate(getRoot(m), transform)
      return getNth(this, n).map(conjugateOne, [])
    }
  }

  View.prototype.adjectives = function (n) {
    const found = getNth(this.match('#Adjective'), n)
    return new Adjectives(this.document, found.pointer)
  }
}
export default api
--------------------------------------------------------------------------------
/src/02-two/preTagger/tagRank.js:
--------------------------------------------------------------------------------
const boringTags = new Set(['Auxiliary', 'Possessive'])

// how many child-tags a (known) tag has
const kidCount = (tag, tagSet) => (tagSet[tag].children || []).length

// order tags by how many children they have, fewest first.
// 'boring' tags are pushed toward the end, and tags missing from the tagSet toward the front
const sortByKids = function (tags, tagSet) {
  const compare = (a, b) => {
    // (unknown tags are interesting)
    if (boringTags.has(a) || !tagSet.hasOwnProperty(b)) {
      return 1
    }
    if (boringTags.has(b) || !tagSet.hasOwnProperty(a)) {
      return -1
    }
    return kidCount(a, tagSet) - kidCount(b, tagSet)
  }
  return tags.sort(compare)
}

// write a sorted `tagRank` array onto every term of the view
const tagRank = function (view) {
  const tagSet = view.world.model.one.tagSet
  for (const terms of view.document) {
    for (const term of terms) {
      term.tagRank = sortByKids(Array.from(term.tags), tagSet)
    }
  }
}
export default tagRank
32 |
--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
1 | ### 0.2.8 [Aug 2023]
2 |
3 | - **[fix]** - conjugation issues
4 | - **[update]** - dependencies
5 |
6 | ### 0.2.7 [May 2023]
7 |
8 | - **[fix]** - tagging
9 | - **[new]** - `fr-compromise-dates`
10 |
11 | ### 0.2.6 [Feb 2023]
12 |
13 | - **[fix]** - support multi-lexicon
14 | - **[fix]** - try new suffix thumb
15 | - **[fix]** - conjugation fixes
16 |
17 | ### 0.2.0 [Sept 2022]
18 |
19 | - **[fix]** - inflections+conjugations
20 | - **[new]** - start of verb, noun, and adjective methods
21 |
22 | ### 0.1.2 [August 2022]
23 |
24 | - **[fix]** - inflections+conjugations
25 |
26 | ### 0.1.1 [July 2022]
27 |
28 | - **[fix]** - import format
29 | - **[new]** - typescript types
30 |
31 | ### 0.1.0 [June 2022]
32 |
33 | - **[new]** - `.compute('root')`
34 | - **[new]** - number-parsing
35 |
36 | ### 0.0.2 [June 2022]
37 |
38 | - **[new]** - support root matches
39 | - **[new]** - `.compute('root')`
40 | - **[new]** - FirstPerson, SecondPerson tags etc.
41 |
--------------------------------------------------------------------------------
/src/01-one/tokenize/contractions.js:
--------------------------------------------------------------------------------
// Contraction rules.
// `word` rows expand one whole word; `before` rows expand the part before an apostrophe.
// NOTE(review): matching order and apostrophe handling live in the consumer, not shown here.
const contractions = [
  { word: "qu'il", out: ['que', 'il'] },
  // "n'y" is 'ne' + 'y' (it used to expand to 'ne' + 'a')
  { word: "n'y", out: ['ne', 'y'] },
  { word: "n'est", out: ['ne', 'est'] },
  { word: 'aux', out: ['à', 'les'] },
  { word: 'au', out: ['à', 'le'] },
  { before: 'm', out: ['me'] },
  { before: 's', out: ['se'] },
  { before: 't', out: ['tu'] },
  { before: 'n', out: ['ne'] },
  { before: 'qu', out: ['que'] },//tant qu'étudiant
  { before: 'puisqu', out: ['puisque'] },
  { before: 'lorsqu', out: ['lorsque'] },//lorsqu’il
  { before: 'jusqu', out: ['jusque'] },//jusqu'en
  { before: 'quelqu', out: ['quelque'] },//Quelqu'un

  { word: 'auquel', out: ['à', 'lequel'] },
  { word: 'auxquels', out: ['à', 'lesquels'] },
  { word: 'auxquelles', out: ['à', 'lesquelles'] },
  { word: 'duquel', out: ['de', 'lequel'] },
  { word: 'desquels', out: ['de', 'lesquels'] },
  { word: 'desquelles', out: ['de', 'lesquelles'] },
]
export default contractions
--------------------------------------------------------------------------------
/learn/wiktionary/index.js:
--------------------------------------------------------------------------------
1 | // import wtf from 'wtf_wikipedia'
2 | import rp from 'request-promise';
3 | import $ from 'cheerio';
4 | import list from './list.js'
5 |
6 |
/**
 * Fetch the fr.wiktionary page of `word`, and return the link texts found in
 * the first part of its '.flextable-fr-mfsp' table.
 * Link texts that start with a backslash are skipped.
 * On any failure, the error is reported on stderr and undefined is returned.
 */
const doit = async function (word) {
  const url = `https://fr.wiktionary.org/wiki/${encodeURIComponent(word)}`;
  try {
    const html = await rp(url)
    const all = []
    $('.flextable-fr-mfsp :first a ', html).each(function () {
      const str = $(this).text()
      if (!str.match(/^\\/)) {
        all.push(str)
      }
    })
    return all
  } catch (err) {
    // stderr, so the JSON this script prints on stdout stays parseable
    console.error(`error fetching '${word}':`, err && err.message ? err.message : err)
    return undefined
  }
}
27 |
// look up every word of the list, one request at a time, then print the results as JSON
; (async () => {
  const all = {}
  for (const w of Object.keys(list)) {
    all[w] = await doit(w)
  }
  console.log(JSON.stringify(all, null, 2))
})()
--------------------------------------------------------------------------------
/src/03-three/topics/api.js:
--------------------------------------------------------------------------------
// person names, with any leading honorifics
const findPeople = function () {
  return this.match('#Honorific+? #Person+')
}

// runs of Organization terms
const findOrgs = function () {
  return this.match('#Organization+')
}

// runs of Place/Address terms, split apart at most commas
const findPlaces = function () {
  const places = this.match('(#Place|#Address)+')

  // 'paris, france' - a city/region followed by a country/region stays in one piece
  const staysTogether = c => c.has('(#City|#Region|#ProperNoun)$') && c.after('^(#Country|#Region)').found

  const splits = places.match('@hasComma').filter(c => {
    // 'europe, china' - always split after a continent
    if (c.has('(asia|africa|europe|america)$')) {
      return true
    }
    return !staysTogether(c)
  })
  return places.splitAfter(splits)
}

// attach the three finders to the View prototype
const api = function (View) {
  Object.assign(View.prototype, {
    people: findPeople,
    organizations: findOrgs,
    places: findPlaces,
  })
}
35 |
36 | export default api
37 |
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/3rd-pass/noun-plurals.js:
--------------------------------------------------------------------------------
// singular nouns that happen to end in 's'
const exceptions = new Set([
  'bras',
  'bus',
  'corps',
  'discours',
  'fils',
  'héros',
  'os',
  'pays',
  'procès',
  'poids',
  'repas',
  'sens',
  'succès',
])

// nouns carrying any of these tags are left alone
const skipTags = ['Pronoun', 'ProperNoun', 'Uncountable', 'Date']

/**
 * Guess a Singular / PluralNoun tag for the Noun at index `i`, from its spelling.
 * Returns whatever setTag returns, or null when no tag is applied.
 */
const nounPlurals = function (terms, i, world) {
  const { setTag } = world.methods.one
  const term = terms[i]
  const { tags } = term
  const word = term.implicit || term.normal || term.text || ''
  if (!tags.has('Noun') || skipTags.some(tag => tags.has(tag))) {
    return null
  }
  if (exceptions.has(word)) {
    return setTag([term], 'Singular', world, false, '3-plural-guess')
  }
  // an '-s' ending reads as plural, unless the word ends in '-is'
  const endsLikePlural = word.endsWith('s') && !word.endsWith('is')
  return endsLikePlural ? setTag([term], 'PluralNoun', world, false, '3-plural-guess') : null
}
export default nounPlurals
--------------------------------------------------------------------------------
/data/lexicon/misc/prepositions.js:
--------------------------------------------------------------------------------
// these need some work
// (each word is listed once - 'a' and 'des' used to appear twice)
const prepositions = [
  'lorsque',
  'puisque',
  'lorsqu',
  'puisqu',
  'quoiqu',
  'pourquoi',
  'quelqu',
  'quoique',

  'y',// -?

  'de', 'du', 'des',
  'a',
  'd',
  'en',
  'dans',
  'pour',
  'par',
  'sur',
  'avec',
  'apres',
  'selon',
  'depuis',
  'contre',
  'entre',
  'comme',
  'avant',
  'sans',
  'devant',
  'sous',
  'vers',
  'pendant',
  'afin',
  'durant',
  'parmi',
  'pres',
  'malgre',
  'chez',
  'aupres',
  "jusqu'",
  'concernant',
  'à',
  'derriere',
  'hors',
  'outre',
  'envers',
  'sauf',
  'via',
  'jusque',
  'suivant',
  'hormis',
  'environ',
  'par dessus',
  'excepte',
  "quelqu'",
  'because',
  'grace',
  'courant',
  'au dessus',
  'voici',
]
export default prepositions
66 |
--------------------------------------------------------------------------------
/learn/verbs/old.js:
--------------------------------------------------------------------------------
1 | import verbs from './data.js'
2 | import { learn, test, validate, compress } from 'suffix-thumb'
// conjugations holding a '|' or a '[' are skipped
const hasPipe = /[\|\[]/

// position of each pronoun inside a tense's list of forms
const index = {
  'je': 0, // "achète",
  'tu': 1, // "achètes",
  'il': 2, // "achète",
  'nous': 3, // "achetons",
  'vous': 4, // "achetez",
  'ils': 5, // "achètent"
}

// learn one infinitive -> conjugation model, for a given tense + pronoun
const doModel = function (tense, form) {
  const slot = index[form]
  const pairs = Object.keys(verbs)
    .map(inf => [inf, verbs[inf][tense][slot]])
    .filter(pair => pair[1] && !hasPipe.test(pair[1]))
  return learn(validate(pairs))
}

// one model per pronoun, for the present tense
const tense = "Présent"
const models = {}
for (const form of ['je', 'tu', 'il', 'nous', 'vous', 'ils']) {
  models[form] = doModel(tense, form)
}

console.log(JSON.stringify(models, null, 2))
43 |
--------------------------------------------------------------------------------
/data/lexicon/numbers/units.js:
--------------------------------------------------------------------------------
// Unit words and symbols.
// 'kilomètres' is listed, so 'kilomètre' has its plural like the other units.
const units = [
  '°c',
  'celsius',
  '°f',
  'fahrenheit',
  'fahrenheits',
  'kelvin',
  'kelvins',
  '°n',
  'm³',

  'hertz',
  'km/h',
  'byte',
  'bytes',
  // 'kb',
  'kilobyte',
  'kilobytes',
  // 'mb',
  'megabyte',
  'megabytes',
  // 'gb',
  'gigabyte',
  'gigabytes',
  // 'tb',
  'terabyte',
  'terabytes',
  'petabyte',
  'petabytes',
  'eb',
  'exabyte',
  'exabytes',
  'zb',
  'zettabyte',
  'zettabytes',
  'yb',
  'yottabyte',
  'yottabytes',
  'joule',
  'joules',

  'µs',

  'percent',


  'gramme',
  'grammes',
  'kilogramme',
  'kilogrammes',
  'kilo',
  'kilos',
  'litre',
  'litres',
  'millilitre',
  'millilitres',
  'centimètre',
  'centimètres',
  'mètre',
  'mètres',
  'kilomètre',
  'kilomètres',
  'km',
  'kms',
  // pied
  'pouce',
  'pouces',
  'mile',
  'miles'
  // livre
]
export default units
71 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2019 Spencer Kelly
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/plugins/dates/src/phrase/date/data.js:
--------------------------------------------------------------------------------
// French month names and common abbreviations, mapped to a 1-based month number.
// Both accented and unaccented spellings are listed so either form can be looked up.
const months = {
  'janvier': 1, // January
  'février': 2, // February
  'fevrier': 2, // February
  'mars': 3, // March
  'avril': 4, // April
  'mai': 5, // May
  'juin': 6, // June
  'juillet': 7, // July
  'août': 8, // August
  'aout': 8, // August
  'septembre': 9, // September
  'octobre': 10, // October
  'novembre': 11, // November
  'décembre': 12, // December
  'decembre': 12, // December
  'jan': 1,
  'fév': 2,
  'fev': 2,
  'mar': 3,
  'avr': 4,
  'aou': 8,
  'sep': 9,
  'sept': 9, // also the number seven - ambiguous
  'oct': 10,
  'nov': 11,
  'déc': 12,
  'dec': 12,
  'janv': 1,
  'févr': 2,
  'fevr': 2,
  'juil': 7,
  'juill': 7,
}
33 |
// French weekday names and their three-letter abbreviations, mapped to a day
// index where 0 is Sunday and 6 is Saturday.
// NOTE(review): 'mar' is also a key of `months` (March) - callers have to pick
// the right table from context.
const days = {
  'lundi': 1, // Monday
  'mardi': 2, // Tuesday
  'mercredi': 3, // Wednesday
  'jeudi': 4, // Thursday
  'vendredi': 5, // Friday
  'samedi': 6, // Saturday
  'dimanche': 0, // Sunday
  'lun': 1,
  'mar': 2,
  'mer': 3,
  'jeu': 4,
  'ven': 5,
  'sam': 6,
  'dim': 0,
}
50 | export { months, days }
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/2nd-pass/neighbours.js:
--------------------------------------------------------------------------------
// Lookup used by tagNeighbours: each key is the normalized text of the
// preceding word, and the value is the tag given to the (still untagged)
// word that follows it.
const hasBefore = {
  // determiners and contracted articles
  la: 'FemaleNoun',
  une: 'FemaleNoun',
  un: 'MaleNoun',
  du: 'MaleNoun',
  au: 'MaleNoun',
  des: 'PluralNoun',
  aux: 'PluralNoun',
  de: 'Noun',
  // modals
  dois: 'Verb',
  doit: 'Verb',
  devons: 'Verb',
  devez: 'Verb',
  doivent: 'Verb',

  peux: 'Verb',
  peut: 'Verb',
  pouvons: 'Verb',
  pouvez: 'Verb',
  peuvent: 'Verb',
  // (imperfect and conditional)
  pouvait: 'Verb',
  pourrait: 'Verb',
  pourrais: 'Verb',
  pourrions: 'Verb',
  pourriez: 'Verb',
  pourraient: 'Verb',

  // other cues
  avoir: 'Noun',
  pas: 'Verb' //maybe
}
34 |
/**
 * Guess a tag for an untagged term from the word directly before it.
 * @param {Array} terms - terms of the current sentence
 * @param {number} i - index of the term to inspect
 * @param {object} world - provides `methods.one.setTag`
 * @returns {true|null} true when a tag was applied, otherwise null
 */
const tagNeighbours = function (terms, i, world) {
  const { setTag } = world.methods.one
  const previous = terms[i - 1]
  // the first term of a sentence has no left-hand neighbour
  if (!previous) {
    return null
  }
  const cue = previous.normal
  const current = terms[i]
  const isUntagged = current.tags.size === 0
  if (!isUntagged || !hasBefore.hasOwnProperty(cue)) {
    return null
  }
  setTag([current], hasBefore[cue], world, false, 'neighbour')
  return true
}
46 | export default tagNeighbours
--------------------------------------------------------------------------------
/plugins/dates/src/api.js:
--------------------------------------------------------------------------------
1 | import find from './find.js'
2 | import parse from './phrase/index.js'
3 | import spacetime from 'spacetime'
4 | import toJson from './toJson.js'
5 |
/**
 * Narrow a view to its n-th match when `n` is a number; otherwise return the
 * view untouched.
 * @param {object} doc - a view exposing `.eq(n)`
 * @param {number} [n] - index of the wanted match
 * @returns {object} the n-th match, or `doc` itself
 */
export const getNth = function (doc, n) {
  if (typeof n === 'number') {
    return doc.eq(n)
  }
  return doc
}
7 |
8 |
/**
 * Install the `.dates()` method on the given View class.
 * @param {Function} View - the base View class to extend
 */
const api = function (View) {
  // a View subclass that remembers the options it was found with
  class Dates extends View {
    constructor(document, pointer, groups, opts = {}) {
      super(document, pointer, groups)
      this.viewType = 'Dates'
      this.opts = opts || {}
    }
    /**
     * Parse each date match (or only the n-th one) into its JSON form.
     * @param {number} [n] - optional index of a single match
     */
    parse(n) {
      // pass the match and the options as two arguments, as json() does
      return getNth(this, n).map(m => toJson(parse(m, this.opts)))
    }
    /**
     * Regular json output for each match, with the parsed result added
     * under a `dates` property.
     * @param {object} [opts] - options forwarded to the underlying json()
     * @param {number} [n] - optional index of a single match
     */
    json(opts, n) {
      let m = getNth(this, n)
      let arr = m.map(vb => {
        let out = vb.toView().json(opts)[0] || {}
        let res = parse(vb, this.opts)
        out.dates = toJson(res)
        return out
      }, [])
      return arr
    }
  }

  /**
   * Find date phrases in this view.
   * @param {object} [opts] - `today` and `timezone` are handed to spacetime
   */
  View.prototype.dates = function (opts = {}) {
    // resolve the reference date once, so all parsing shares it
    opts.today = spacetime(opts.today, opts.timezone)
    let m = find(this, opts)
    return new Dates(this.document, m.pointer, null, opts)
  }
}
37 | export default api
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/1st-pass/regex.js:
--------------------------------------------------------------------------------
1 | const hasApostrophe = /['‘’‛‵′`´]/
2 |
/**
 * Return the first rule whose regex (at index 0) matches `str`.
 * @param {string} str - text to test
 * @param {Array<Array>} regs - rules shaped like [regex, tag, reason?]
 * @returns {Array|null} the matching rule, or null when nothing matches
 */
const doRegs = function (str, regs) {
  const hit = regs.find(rule => rule[0].test(str) === true)
  return hit === undefined ? null : hit
}
12 |
/**
 * Tag a term from the regex rule tables in `world.model.two`.
 * Text rules run on the raw text, normal rules on the machine/normal form,
 * and number rules only when the word contains a digit.
 * @param {Array} terms - terms of the current sentence
 * @param {number} i - index of the term to inspect
 * @param {object} world - provides `methods.one.setTag` and `model.two`
 * @returns {true|null} true when a rule matched and a tag was set
 */
const checkRegex = function (terms, i, world) {
  const { setTag } = world.methods.one
  const term = terms[i]
  const { regexText, regexNormal, regexNumbers } = world.model.two
  const normal = term.machine || term.normal
  // keep a dangling apostrophe that trails the word
  const keepApostrophe = hasApostrophe.test(term.post) && !hasApostrophe.test(term.pre)
  const text = keepApostrophe ? term.text + term.post.trim() : term.text

  let found = doRegs(text, regexText) || doRegs(normal, regexNormal)
  // the number rules are only worth trying when there is a digit
  if (!found && /[0-9]/.test(normal)) {
    found = doRegs(normal, regexNumbers)
  }
  if (!found) {
    return null
  }
  setTag([term], found[1], world, false, `2-regex- '${found[2] || found[0]}'`)
  term.confidence = 0.6
  return true
}
35 | export default checkRegex
36 |
--------------------------------------------------------------------------------
/src/01-one/lexicon/methods/adjective/index.js:
--------------------------------------------------------------------------------
1 | import { convert, reverse } from 'suffix-thumb'
2 | import model from '../model.js'
3 |
4 | let fRev = reverse(model.adjective.female)
5 | let pRev = reverse(model.adjective.plural)
6 | let fpRev = reverse(model.adjective.femalePlural)
7 |
8 | const toFemale = (str) => convert(str, model.adjective.female)
9 | const toPlural = (str) => convert(str, model.adjective.plural)
10 | const toFemalePlural = (str) => convert(str, model.adjective.femalePlural)
11 | const fromFemale = (str) => convert(str, fRev)
12 | const fromPlural = (str) => convert(str, pRev)
13 | const fromFemalePlural = (str) => convert(str, fpRev)
14 |
/**
 * Build every inflection of a masculine-singular adjective.
 * @param {string} str - the masculine singular form
 * @returns {{male: string, female: string, plural: string, femalePlural: string}}
 */
const conjugate = function (str) {
  const female = toFemale(str)
  const plural = toPlural(str)
  const femalePlural = toFemalePlural(str)
  return { male: str, female, plural, femalePlural }
}
23 |
/**
 * List every non-empty inflection of an adjective.
 * @param {string} str - the masculine singular form
 * @returns {string[]} the forms produced by conjugate(), without empty ones
 */
const all = (str) => Object.values(conjugate(str)).filter(Boolean)
28 |
29 | export default {
30 | all,
31 | conjugate,
32 | toFemale,
33 | toPlural,
34 | toFemalePlural,
35 | fromFemale,
36 | fromPlural,
37 | fromFemalePlural,
38 | }
39 | // console.log(conjugate('frais'))
--------------------------------------------------------------------------------
/data/lexicon/misc/expressions.js:
--------------------------------------------------------------------------------
1 | export default [
2 | 'a la',
3 | 'ah',
4 | 'ahem',
5 | 'argh',
6 | 'bah',
7 | 'boo',
8 | 'bye',
9 | 'dammit',
10 | 'damn',
11 | 'damnit',
12 | 'dang',
13 | 'duh',
14 | 'eek',
15 | 'eep',
16 | 'eh',
17 | 'et cetera',
18 | 'eww',
19 | 'fuck',
20 | 'gah',
21 | 'gee',
22 | 'golly',
23 | 'goodbye',
24 | 'grr',
25 | 'haha',
26 | 'hahaha',
27 | 'hai',
28 | 'hee',
29 | 'hell',
30 | 'hello',
31 | 'hey',
32 | 'hi',
33 | 'hmm',
34 | 'holy moly',
35 | 'holy',
36 | 'hurrah',
37 | 'lmao',
38 | 'lmfao',
39 | 'lol',
40 | 'lols',
41 | 'meh',
42 | 'mmm',
43 | 'nah',
44 | 'nope',
45 | 'oh',
46 | 'ohh',
47 | 'ooh',
48 | 'ooo',
49 | 'oops',
50 | 'ow',
51 | 'oy',
52 | 'pff',
53 | 'phew',
54 | 'please',
55 | 'plz',
56 | 'psst',
57 | 'sheesh',
58 | 'shhh',
59 | 'shit',
60 | 'tsk',
61 | 'ugh',
62 | 'uh huh',
63 | 'uh oh',
64 | 'uh',
65 | 'uhh',
66 | 'uhm',
67 | 'voila',
68 | 'whee',
69 | 'whew',
70 | 'whoa',
71 | 'wow',
72 | 'wtaf',
73 | 'wtf',
74 | 'ya',
75 | 'yaa',
76 | 'yahoo',
77 | 'yay',
78 | 'yeah',
79 | 'yuck',
80 | 'yup',
81 | "d'oh",
82 | ]
83 |
--------------------------------------------------------------------------------
/.github/workflows/build-and-test.yml:
--------------------------------------------------------------------------------
1 | name: Build and test
2 |
3 | on: [pull_request]
4 |
5 | jobs:
6 | build-and-test:
7 | runs-on: ${{ matrix.os }}
8 |
9 | strategy:
10 | matrix:
11 | node-version: [14.x, 18.x]
12 | os: [ubuntu-latest, windows-latest]
13 |
14 | steps:
15 | - uses: actions/checkout@v3
16 |
17 | - name: use node.js ${{ matrix.node-version }}
18 | uses: actions/setup-node@v3
19 | with:
20 | node-version: ${{ matrix.node-version }}
21 |
22 | - name: cache dependencies
23 | uses: actions/cache@v3
24 | with:
25 | path: ~/.npm
26 | key: ${{ runner.os }}-npm-${{ matrix.node-version }}-${{ hashFiles('package-lock.json') }}
27 | restore-keys: |
28 | ${{ runner.os }}-npm-${{ matrix.node-version }}-
29 | ${{ runner.os }}-npm-
30 |
31 | - name: install
32 | run: |
33 | npm ci
34 |
35 | - name: static checks
36 | run: |
37 | npm run lint
38 |
39 | - name: build
40 | run: |
41 | npm run build
42 |
43 | - name: test
44 | run: |
45 | npm run test
46 | npm run testb
47 |
--------------------------------------------------------------------------------
/learn/giga/getList.js:
--------------------------------------------------------------------------------
1 | import { forEachSync } from './_giga.js'
2 | import doSentences from './french.js'
3 | import fs from 'fs'
4 |
5 | let ids = []
6 | for (let i = 1; i <= 10; i += 1) {
7 | let str = String(i).padStart(4, '0')
8 | ids.push(str)
9 | }
10 | // ids = ['0004']
11 |
12 | let list = []
13 | const tag = 'NOM'
14 |
/**
 * Collect the lemma of every French term tagged with `tag` that directly
 * follows 'le' or 'un', appending it to the module-level `list`.
 * @param {object} both - a sentence pair; only `both.fr` is read
 */
const doBoth = function (both) {
  const terms = both.fr
  // start at 1: the first term has no preceding word
  for (let i = 1; i < terms.length; i += 1) {
    const term = terms[i]
    if (term['$'].pos !== tag) {
      continue
    }
    const before = terms[i - 1]['$text'].toLowerCase()
    if (before !== 'le' && before !== 'un') {
      continue
    }
    const word = term['$text']
    const lemma = term['$'].lem
    if (word && lemma) {
      list.push(lemma.toLowerCase().trim())
    }
  }
}
36 |
37 | await forEachSync(ids, async id => {
38 | try {
39 | console.log(`\ndoing ${id}:\n`)
40 | await doSentences(id, doBoth)
41 | } catch (e) {
42 | console.log(e)
43 | }
44 | })
45 | console.log('done')
46 | fs.writeFileSync('./pairs.js', 'export default ' + JSON.stringify(list))
47 |
--------------------------------------------------------------------------------
/src/03-three/verbs/api/parse.js:
--------------------------------------------------------------------------------
1 | import getAdverbs from './adverbs.js'
2 |
/**
 * Return the words before the root verb, minus adverbs, negatives and prefixes.
 * @param {object} vb - the verb-phrase view
 * @param {*} root - what to split the phrase before
 * @returns {object} the auxiliary part, or an empty view when nothing precedes the root
 */
const getAuxiliary = function (vb, root) {
  const parts = vb.splitBefore(root)
  const hasLeadingPart = parts.length > 1
  if (!hasLeadingPart) {
    return vb.none()
  }
  return parts.eq(0).not('(#Adverb|#Negative|#Prefix)')
}
12 |
/**
 * Select the negative words inside a verb phrase.
 * @param {object} vb - the verb-phrase view
 * @returns {object} the result of matching '#Negative'
 */
const getNegative = (vb) => vb.match('#Negative')
16 |
17 | // pull-apart phrasal-verb into verb-particle
18 | // const getPhrasal = function (root) {
19 | // let particle = root.match('#Particle$')
20 | // return {
21 | // verb: root.not(particle),
22 | // particle: particle,
23 | // }
24 | // }
25 |
/**
 * Compute the 'root' form on the view and return its 'root' text.
 * @param {object} view - a view exposing `.compute()` and `.text()`
 * @returns {string} the root text of the view
 */
const getRoot = function (view) {
  // the root form has to be computed before it can be read
  view.compute('root')
  return view.text('root')
}
31 |
/**
 * Break a verb phrase into its parts.
 * Works on a clone of the given view.
 * @param {object} view - the verb-phrase view
 * @returns {{root: string, prefix: object, adverbs: object, auxiliary: object, negative: object}}
 */
const parseVerb = function (view) {
  const vb = view.clone()
  const root = getRoot(vb)
  const prefix = vb.match('#Prefix')
  const adverbs = getAdverbs(vb, root)
  const auxiliary = getAuxiliary(vb, root)
  const negative = getNegative(vb)
  return { root, prefix, adverbs, auxiliary, negative }
}
46 | export default parseVerb
47 |
--------------------------------------------------------------------------------
/learn/verbs/toPairs.js:
--------------------------------------------------------------------------------
1 | import verbs from './data.js'
2 | import scraped from '../scrape/result.js'
3 |
4 | import { learn, test, validate, compress } from 'suffix-thumb'
5 | const hasPipe = /[\|\[]/
6 |
7 | let index = {
8 | 'je': 0, // "achète",
9 | 'tu': 1, // "achètes",
10 | 'il': 2, // "achète",
11 | 'nous': 3, // "achetons",
12 | 'vous': 4, // "achetez",
13 | 'ils': 5, // "achètent"
14 | }
15 |
/**
 * Collect the conjugated forms of every verb for one tense.
 * Verbs with no data for the tense, or with a blank or one-letter form, are
 * left out, so the result has no `undefined` values and its key count is accurate.
 * @param {string} tense - key of the tense inside each verb record
 * @returns {Object<string, string[]>} infinitive -> list of forms
 */
const getPairs = function (tense) {
  const byWord = {}
  Object.keys(verbs).forEach(inf => {
    const words = verbs[inf][tense] || []
    if (words.length === 0 || words.some(str => str === '' || str.length === 1)) {
      return
    }
    byWord[inf] = words
  })
  return byWord
}
27 |
28 |
29 |
30 | const res = getPairs("Imparfait")
31 | Object.keys(scraped).forEach(inf => {
32 | if (res[inf]) {
33 | return
34 | }
35 | let vals = Object.values(scraped[inf]["Imperfect"])
36 | if (vals.length < 5 || vals.some(str => str === '' || str.length === 1 || str === 'le')) {
37 | return
38 | }
39 | res[inf] = vals
40 | })
41 |
42 | // let model = doModel("Présent", 'je')
43 | // model = compress(model)
44 | console.log(JSON.stringify(res, null, 2))
45 | console.log(Object.keys(res).length)
--------------------------------------------------------------------------------
/src/03-three/verbs/api/toJSON.js:
--------------------------------------------------------------------------------
1 | import parseVerb from './parse.js'
2 | // import getGrammar from './parse/grammar/index.js'
3 | // import { getTense } from './lib.js'
4 |
/**
 * List the normalized text of each match in a view.
 * @param {object} m - a view, or anything else
 * @returns {string[]} one string per match; empty when `m` is not a view
 */
const toArray = function (m) {
  if (m && m.isView) {
    const matches = m.json({ normal: true, terms: false, text: false })
    return matches.map(match => match.normal)
  }
  return []
}
12 |
/**
 * Normalized text of a view.
 * @param {object} m - a view, or anything else
 * @returns {string} the 'normal' text; '' when `m` is not a view
 */
const toText = function (m) {
  const isView = Boolean(m) && Boolean(m.isView)
  return isView ? m.text('normal') : ''
}
19 |
20 | // const toInfinitive = function (root) {
21 | // const { verbToInfinitive } = root.methods.two.transform
22 | // let str = root.text('normal')
23 | // return verbToInfinitive(str, root.model, getTense(root))
24 | // }
25 |
/**
 * Summarize a verb phrase as a plain object.
 * The view is no longer cloned here: the clone was never read, and
 * parseVerb works on its own copy.
 * @param {object} vb - the verb-phrase view
 * @returns {{root: string, preAdverbs: string[], postAdverbs: string[], auxiliary: string, negative: boolean, prefix: string, infinitive: string}}
 */
const toJSON = function (vb) {
  const parsed = parseVerb(vb)
  return {
    root: parsed.root,
    preAdverbs: toArray(parsed.adverbs.pre),
    postAdverbs: toArray(parsed.adverbs.post),
    auxiliary: toText(parsed.auxiliary),
    negative: parsed.negative.found,
    prefix: toText(parsed.prefix),
    // the root is reported as the infinitive as well
    infinitive: parsed.root,
  }
}
41 | export default toJSON
42 |
--------------------------------------------------------------------------------
/data/lexicon/places/places.js:
--------------------------------------------------------------------------------
1 | export default [
2 | //some of the busiest airports in the world from
3 | //https://www.world-airport-codes.com/world-top-30-airports.html
4 | 'ams',
5 | 'atl',
6 | 'bcn',
7 | 'bkk',
8 | 'cdg',
9 | 'cgk',
10 | 'clt',
11 | 'den',
12 | 'dfw',
13 | 'dxb',
14 | 'fco',
15 | 'fra',
16 | 'hkg',
17 | 'hnd',
18 | 'iax',
19 | 'icn',
20 | 'ist',
21 | 'jfk',
22 | 'kul',
23 | 'las',
24 | 'lax',
25 | 'lgw',
26 | 'lhr',
27 | 'mco',
28 | 'muc',
29 | 'ord',
30 | 'pek',
31 | 'phl',
32 | 'phx',
33 | 'sfo',
34 | 'syd',
35 | 'yyz',
36 |
37 | 'antarctic ocean',
38 | 'arctic ocean',
39 | 'atlantic ocean',
40 | 'everglades',
41 | 'great britain',
42 | 'great lakes',
43 | 'indian ocean',
44 | 'new england',
45 | 'pacific ocean',
46 |
47 | //continents
48 | 'africa',
49 | 'europe',
50 | 'americas',
51 | 'asia',
52 |
53 | //some notable neighbourhoods (just #Place)
54 | 'midtown',
55 | 'downtown',
56 | 'uptown',
57 | 'the bronx',
58 | 'brooklyn',
59 | 'manhattan',
60 | 'greenwich',
61 | 'soho',
62 | 'harlem',
63 | 'chinatown',
64 | 'the hamptons',
65 | 'beverly hills',
66 | 'bel air',
67 | 'malibu',
68 | 'gay village',
69 | 'sunderland',
70 | ]
71 |
--------------------------------------------------------------------------------
/data/models/index.js:
--------------------------------------------------------------------------------
1 | import noun from './noun/plurals.js'
2 | import adjective from './adjective/index.js'
3 |
4 | import futureTense from './verb/future-tense.js'
5 | import imperfect from './verb/imperfect.js'
6 | import pastParticiple from './verb/past-participle.js'
7 | import presentTense from './verb/present-tense.js'
8 |
9 | const vbOrder = ['je', 'tu', 'il', 'nous', 'vous', 'ils']
10 | const nOrder = ['plural']
11 | const adjOrder = ['female', 'plural', 'femalePlural']
12 | const todo = {
13 | noun: { data: noun, keys: nOrder },
14 | adjective: { data: adjective, keys: adjOrder },
15 | futureTense: { data: futureTense, keys: vbOrder },
16 | imperfect: { data: imperfect, keys: vbOrder },
17 | pastParticiple: { data: pastParticiple, keys: ['prt'] },
18 | presentTense: { data: presentTense, keys: vbOrder },
19 | }
20 |
// Turn each conjugation table into word-pairs:
// model[kind][form] is a list of [base word, inflected form], where the
// inflected form sits at the form's position in that word's data entry.
let model = {}
for (const [kind, { data, keys }] of Object.entries(todo)) {
  const baseWords = Object.keys(data)
  model[kind] = {}
  keys.forEach((form, i) => {
    model[kind][form] = baseWords.map(inf => [inf, data[inf][i]])
  })
}
35 |
36 | export default model
37 |
--------------------------------------------------------------------------------
/data/lexicon/people/people.js:
--------------------------------------------------------------------------------
1 | export default [
2 | //famous people with names that are hard to recognize independently
3 | //male
4 | 'hitler',
5 | 'ronaldo',
6 | 'ashton kutcher',
7 | 'barack obama',
8 | 'cardinal wolsey',
9 | 'carson palmer',
10 | 'denzel washington',
11 | 'dick wolf',
12 | 'emeril lagasse',
13 | 'hulk hogan',
14 | 'kanye west',
15 | 'kiefer sutherland',
16 | 'kobe bryant',
17 | 'lebron james',
18 | 'messiaen',
19 | 'mitt romney',
20 | 'mubarek',
21 | 'ray romano',
22 | 'rod stewart',
23 | 'ronaldinho',
24 | 'rush limbaugh',
25 | 'saddam hussain',
26 | 'slobodan milosevic',
27 | 'tiger woods',
28 | 'valentino rossi',
29 | 'van gogh',
30 |
31 | //female
32 | 'halle berry',
33 | 'jk rowling',
34 | 'oprah winfrey',
35 | 'paris hilton',
36 | 'reese witherspoon',
37 | 'scarlett johansson',
38 | 'theresa may',
39 | 'tyra banks',
40 | 'virgin mary',
41 |
42 | //sometimes firstname, sometimes lastname
43 | 'brock',
44 | 'carson',
45 | 'clinton',
46 | 'cruz',
47 | 'dalton',
48 | 'dante',
49 | 'effie',
50 | 'ezekiel',
51 | 'gaston',
52 | 'inez',
53 | 'jaime',
54 | 'jefferson',
55 | 'lee',
56 | 'nettie',
57 | 'ora',
58 | 'palmer',
59 | 'piper',
60 | 'sung',
61 | ]
62 |
--------------------------------------------------------------------------------
/learn/wikinews/getLexicon.js:
--------------------------------------------------------------------------------
1 | let lines = require('./parse')
2 | // lines = lines.slice(0, 300)
3 |
4 | let tags = {}
5 | lines.forEach((s) => {
6 | s.forEach((w) => {
7 | tags[w.tag] = tags[w.tag] || {}
8 | let word = w.word.toLowerCase()
9 | tags[w.tag][word] = tags[w.tag][word] || 0
10 | tags[w.tag][word] += 1
11 | })
12 | })
13 |
14 | // 'P+D': 241,
15 | // ADJ: 719,
16 | // ADV: 311,
17 | // CC: 172,
18 | // CLO: 32,
19 | // CLR: 53,
20 | // CLS: 88,
21 | // CS: 90,
22 | // DET: 1353,
23 | // ET: 136,
24 |
25 | // nouns:
26 | // NC: 1877,
27 | // NPP: 493,
28 | // P: 1242,
29 | // PREF: 8,
30 |
31 | // PRO: 43, //pronoun
32 | // PROREL: 89, //relative pronoun
33 | // U: 100,
34 |
35 | // V: 509,
36 | // VINF: 140,
37 | // VPP: 402, //PastTense
38 | // VPR: 61, //Gerund
39 | // VS: 10, //presentTense
40 |
41 | // VPP: 'PastTense',
42 | // VPR: 'Gerund',
43 | // VS: 'V',
44 |
/**
 * Keys of a count table that occur more than once, most frequent first.
 * @param {Object<string, number>} obj - word -> count
 * @returns {string[]} keys with a count above 1, sorted by descending count
 */
const top = function (obj) {
  // yields -1, 0 or 1 so that larger counts come first
  const byCountDesc = (a, b) => Number(obj[a] < obj[b]) - Number(obj[a] > obj[b])
  const ranked = Object.keys(obj).sort(byCountDesc)
  return ranked.filter(k => obj[k] > 1)
}
59 |
60 | console.log(JSON.stringify(top(tags['ADJ']), null, 2))
61 |
--------------------------------------------------------------------------------
/learn/verbs/single-pairs.js:
--------------------------------------------------------------------------------
1 | import verbs from './data.js'
2 | import scraped from '../scrape/result.js'
3 |
4 | import { learn, test, validate, compress } from 'suffix-thumb'
5 | const hasPipe = /[\|\[]/
6 |
7 | let index = {
8 | 'je': 0, // "achète",
9 | 'tu': 1, // "achètes",
10 | 'il': 2, // "achète",
11 | 'nous': 3, // "achetons",
12 | 'vous': 4, // "achetez",
13 | 'ils': 5, // "achètent"
14 | }
15 |
/**
 * Map each infinitive to the first listed form of the given tense.
 * Verbs with no forms, or with a blank or one-letter form, are skipped.
 * @param {string} tense - key of the tense inside each verb record
 * @returns {Object<string, string>} infinitive -> first form
 */
const getPairs = function (tense) {
  return Object.keys(verbs).reduce((byWord, inf) => {
    const words = verbs[inf][tense] || []
    const isUsable = words.length > 0 && words.every(str => str !== '' && str.length !== 1)
    if (isUsable) {
      byWord[inf] = words[0]
    }
    return byWord
  }, {})
}
27 |
28 |
29 |
30 | const res = getPairs("Participe Passé")
31 | Object.keys(scraped).forEach(inf => {
32 | if (res[inf]) {
33 | return
34 | }
35 | let vals = Object.values(scraped[inf]["Present Perfect"])
36 | if (vals.length < 5 || vals.some(str => str === '' || str.length === 1 || str === 'le')) {
37 | return
38 | }
39 | res[inf] = vals[0].replace(/^(a|ai) /, '')
40 | })
41 |
42 | // let model = doModel("Présent", 'je')
43 | // model = compress(model)
44 | console.log(JSON.stringify(res, null, 2))
45 | console.log(Object.keys(res).length)
--------------------------------------------------------------------------------
/src/03-three/verbs/api/find.js:
--------------------------------------------------------------------------------
// Select the verb phrases of a document.
// Starts from an initial match, cuts it into smaller phrases with a series of
// split rules, then drops trailing reflexives and anything without a #Verb.
// The rules are applied in order, each one on the output of the previous one.
// NOTE(review): the word patterns below are English (do|did|is|will, being,
// help, go) - confirm they are intended for French text.
const findVerbs = function (doc) {
  // NOTE(review): the match pattern here is an empty string - confirm this
  // selects the intended verb phrases.
  let m = doc.match('')

  // never run a phrase across a comma
  m = m.splitAfter('@hasComma')

  // the reason he will is ...
  // all i do is talk
  m = m.splitAfter('[(do|did|am|was|is|will)] (is|was)', 0)
  // m = m.splitAfter('[(do|did|am|was|is|will)] #PresentTense', 0)

  // cool

  // like being pampered
  m = m.splitBefore('(#Verb && !#Copula) [being] #Verb', 0)
  // like to be pampered
  m = m.splitBefore('#Verb [to be] #Verb', 0)

  // implicit conjugation - 'help fix'

  m = m.splitAfter('[help] #PresentTense', 0)
  // what i can sell is..
  m = m.splitBefore('(#PresentTense|#PastTense) [#Copula]$', 0)
  // what i can sell will be
  m = m.splitBefore('(#PresentTense|#PastTense) [will be]$', 0)

  // professes love
  // (split before the infinitive, unless the matched phrase starts with 'go')
  let toVerbs = m.match('(#PresentTense|#PastTense) #Infinitive')
  if (toVerbs.found && !toVerbs.has('^go')) {
    m = m.splitBefore('(#PresentTense|#PastTense) [#Infinitive]', 0)
  }
  // 'allow yourself'
  m = m.not('#Reflexive$')
  //ensure there's actually a verb
  m = m.if('#Verb')
  // the reason he will is ...
  // ensure it's not two verbs
  return m
}
39 | export default findVerbs
40 |
--------------------------------------------------------------------------------
/learn/verbs/learn.js:
--------------------------------------------------------------------------------
1 | let verbs = require('./data')
2 |
3 | let pairs = []
4 | Object.keys(verbs).forEach((inf) => {
5 | let want = verbs[inf]['Présent'][0]
6 | if (want) {
7 | pairs.push([inf, want])
8 | }
9 | })
10 |
// Suffix rewrites from an infinitive to its first-person-singular present form.
// Order matters: the first matching rule wins, so specific endings come first.
const regs = [
  [/ébrer$/, 'èbre'],
  [/eter$/, 'ette'],
  [/er$/, 'e'],

  [/dre$/, 'ds'],
  [/ure$/, 'us'],
  [/tre$/, 's'],
  [/ire$/, 'is'],
  [/ore$/, 'os'],
  [/cre$/, 'cs'],

  [/llir$/, 'lle'],
  [/voir$/, 'vois'],
  [/tir$/, 's'],
  [/ir$/, 's'],
]

/**
 * Guess the 'je' present-tense form of an infinitive.
 * @param {string} str - the infinitive
 * @returns {string} the guessed conjugation
 */
const toJe = function (str) {
  // apply the first rule whose ending matches
  const rule = regs.find(([reg]) => reg.test(str))
  if (rule) {
    const [reg, replacement] = rule
    // the circumflex is dropped in these forms (connaître -> connais)
    return str.replace(reg, replacement).replace(/î/, 'i')
  }
  // otherwise, just add an 's'
  return str + 's'
}
46 |
47 | let count = 0
48 | pairs.forEach((a) => {
49 | let je = toJe(a[0])
50 | if (je === a[1]) {
51 | count += 1
52 | } else {
53 | if (a[0].endsWith('oir')) {
54 | console.log(`${a[0]} ~${je}~ want:(${a[1]})`)
55 | }
56 | }
57 | })
58 |
59 | console.log(count / pairs.length)
60 |
--------------------------------------------------------------------------------
/src/03-three/numbers/format/index.js:
--------------------------------------------------------------------------------
1 | import toText from './toText.js'
2 | import { toOrdinal } from '../parse/_data.js'
3 |
// Pull the text that surrounds a parsed number, defaulting both to ''.
const makeSuffix = function (obj) {
  return {
    prefix: obj.prefix || '',
    suffix: obj.suffix || '',
  }
}

/**
 * Render a parsed number in the requested format.
 * @param {object} parsed - holds `num`, plus optional `prefix` and `suffix`
 * @param {string} [fmt] - 'TextOrdinal', 'TextCardinal', 'Ordinal' or 'Cardinal'
 * @returns {string} the formatted number wrapped in its prefix and suffix
 */
const formatNumber = function (parsed, fmt) {
  const { prefix, suffix } = makeSuffix(parsed)
  const wrap = (num) => `${prefix}${num}${suffix}`

  if (fmt === 'TextOrdinal') {
    // spell the number out, then make only its last word ordinal
    const words = toText(parsed.num)
    const last = words[words.length - 1]
    words[words.length - 1] = toOrdinal[last]
    return wrap(words.join(' '))
  }
  if (fmt === 'TextCardinal') {
    return wrap(toText(parsed.num).join(' '))
  }
  // numeric formats
  // '55e'
  if (fmt === 'Ordinal') {
    const str = String(parsed.num)
    // only 1 takes 'er' (1er); 11, 21, 31 ... take 'e' like every other number
    const ending = str === '1' ? 'er' : 'e'
    return wrap(str + ending)
  }
  if (fmt === 'Cardinal') {
    return wrap(String(parsed.num))
  }
  // unknown format: plain digits, or nothing when there is no number
  return wrap(String(parsed.num || ''))
}
43 | export default formatNumber
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/2nd-pass/suffix-lookup.js:
--------------------------------------------------------------------------------
1 |
/**
 * Sweep through the endings of a word, longest first, and return the tag of
 * the first ending found in the suffix table.
 * Endings run from 7 characters (or the word's length minus 1, for short
 * words) down to 2. A missing bucket for a length is skipped, not dereferenced.
 * @param {string} [str] - the word to inspect
 * @param {Array<object>} [suffixes] - indexed by suffix length; each entry maps suffix -> tag
 * @returns {string|null} the tag of the longest known suffix, or null
 */
const suffixLoop = function (str = '', suffixes = []) {
  const len = str.length
  let max = 7
  if (len <= max) {
    max = len - 1
  }
  for (let i = max; i > 1; i -= 1) {
    const suffix = str.slice(len - i)
    const bucket = suffixes[suffix.length]
    if (bucket && Object.prototype.hasOwnProperty.call(bucket, suffix)) {
      return bucket[suffix]
    }
  }
  return null
}
19 |
/**
 * Decide a tag from the ending of an untagged word.
 * The normal form is tried first, then the implicit form when there is one.
 * @param {Array} terms - terms of the current sentence
 * @param {number} i - index of the term to inspect
 * @param {object} world - provides `methods.one.setTag` and `model.two.suffixPatterns`
 * @returns {true|null} true when a tag was set, otherwise null
 */
const suffixCheck = function (terms, i, world) {
  const { setTag } = world.methods.one
  const suffixes = world.model.two.suffixPatterns
  const term = terms[i]
  // only words that nothing else has tagged yet
  if (term.tags.size !== 0) {
    return null
  }
  const fromNormal = suffixLoop(term.normal, suffixes)
  if (fromNormal !== null) {
    setTag([term], fromNormal, world, false, '2-suffix')
    term.confidence = 0.7
    return true
  }
  // try implicit form of word, too
  if (!term.implicit) {
    return null
  }
  const fromImplicit = suffixLoop(term.implicit, suffixes)
  if (fromImplicit === null) {
    return null
  }
  setTag([term], fromImplicit, world, false, '2-implicit-suffix')
  term.confidence = 0.7
  return true
}
44 | export default suffixCheck
45 |
--------------------------------------------------------------------------------
/plugins/dates/scratch.js:
--------------------------------------------------------------------------------
1 | import nlp from '../../src/index.js'
2 | import plg from './src/plugin.js'
3 | nlp.plugin(plg)
4 | // nlp.verbose(true)
5 | let arr = [
6 | `Je peux emprunter votre voiture entre le 2 mai et le 14 juillet`,
7 | `Je peux emprunter votre voiture jusqu'au quatorze juillet`,
8 | 'entre sept et oct',
9 | `jusqu'en juin`,
10 | `jusqu'à juin`,
11 | `jusqu'à le quatorze juillet`,
12 | 'decembre 25, 2012',
13 | 'Juin 5, 2012',
14 | 'hier après-midi',
15 | '14h30 demain',
16 | 'hier après-midi',
17 | 'aujourd\'hui',
18 | 'hier soir',
19 | `Novembre 3, 2021`,
20 | // 'Novembre 3, 2021',
21 | // '12/01/2018',
22 | // '13/01/2018',
23 | // '5/2/2020',
24 | `le quatorze juillet.`,
25 | 'Mercredi 11 mars',
26 | `Le 6 avril`,
27 | `Il n'y a pas d'augmentation prévue jusqu'en 2032`,
28 | `le 3 novembre 2012`,
29 | 'je suis né le 2 septembre 1982',
30 | 'rendez-vous avant vendredi',
31 | `je t'appellerai jusqu'en septembre`,
32 | `15/12/2020`,
33 | `2020-10-02T07:10:12`,
34 | `juin 2e`,
35 | `2021-02-12`,
36 | `je suis né en juin`,
37 | `ta voiture jusqu’à lundi prochain`,
38 | `entre sept et oct`,
39 | ]
40 | let doc = nlp(arr[0]).debug()
41 |
42 | // let m = doc.match('[#Value] [#Month]')
43 | // m.debug()
44 | // m.groups().date.debug()
45 | // m.groups().month.debug()
46 |
47 | let json = doc.dates({ timezone: 'UTC', today: '2023-03-02' }).json({ terms: false })
48 | console.dir(json, { depth: 5 })
--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
1 | {
2 | "root": true,
3 | "extends": [
4 | "eslint:recommended",
5 | "plugin:regexp/recommended"
6 | ],
7 | "ignorePatterns": [
8 | "builds/*",
9 | "learn/**",
10 | "scripts/**",
11 | "plugins/dates/**"
12 | ],
13 | "env": {
14 | "es6": true,
15 | "browser": true,
16 | "node": true
17 | },
18 | "parserOptions": {
19 | "ecmaVersion": "latest",
20 | "sourceType": "module"
21 | },
22 | "rules": {
23 | "comma-dangle": [
24 | 1,
25 | "only-multiline"
26 | ],
27 | "quotes": [
28 | 0,
29 | "single",
30 | "avoid-escape"
31 | ],
32 | "max-nested-callbacks": [
33 | 1,
34 | 4
35 | ],
36 | "max-params": [
37 | 1,
38 | 5
39 | ],
40 | "consistent-return": 1,
41 | "no-bitwise": 1,
42 | "no-empty": 1,
43 | "no-console": 1,
44 | "no-duplicate-imports": 1,
45 | "no-eval": 2,
46 | "no-implied-eval": 2,
47 | "no-mixed-operators": 2,
48 | "no-multi-assign": 2,
49 | "no-nested-ternary": 1,
50 | "no-prototype-builtins": 0,
51 | "no-self-compare": 1,
52 | "no-sequences": 1,
53 | "no-shadow": 2,
54 | "no-unmodified-loop-condition": 1,
55 | "no-use-before-define": 1,
56 | "prefer-const": 0,
57 | "radix": 1,
58 | "no-unused-vars": 1,
59 | "regexp/prefer-d": 0,
60 | "regexp/prefer-w": 0,
61 | "regexp/prefer-range": 0,
62 | "regexp/no-unused-capturing-group": 0
63 | }
64 | }
--------------------------------------------------------------------------------
/data/lexicon/misc/currencies.js:
--------------------------------------------------------------------------------
1 | export default [
2 | '¢',
3 | '$',
4 | '£',
5 | '¥',
6 | '฿',
7 | '₡',
8 | '€',
9 | '₭',
10 | '₨',
11 | '﷼',
12 | 'aud',
13 | 'baht',
14 | 'bitcoin',
15 | 'bitcoins',
16 | 'cad',
17 | 'cent',
18 | 'cents',
19 | 'cny',
20 | 'denar',
21 | 'denars',
22 | 'dime',
23 | 'dimes',
24 | 'dinar',
25 | 'dinars',
26 | 'dirham',
27 | 'dirhams',
28 | 'dkk',
29 | 'dobra',
30 | 'dobras',
31 | 'dollar',
32 | 'dollars',
33 | 'eur',
34 | 'euro',
35 | 'euros',
36 | 'forint',
37 | 'forints',
38 | 'franc',
39 | 'francs',
40 | 'gbp',
41 | 'hkd',
42 | 'inr',
43 | 'jpy',
44 | 'kn',
45 | 'kr',
46 | 'nis',
47 | 'krona',
48 | 'kronas',
49 | 'krw',
50 | 'kwanza',
51 | 'kwanzas',
52 | 'kyat',
53 | 'kyats',
54 | 'lei',
55 | 'lempira',
56 | 'lempiras',
57 | 'lira',
58 | 'liras',
59 | 'pence',
60 | 'pences',
61 | 'pennies',
62 | 'penny',
63 | 'peso',
64 | 'pesos',
65 | 'pound sterling',
66 | 'pound sterlings',
67 | 'pound',
68 | 'pounds',
69 | 'riel',
70 | 'rouble',
71 | 'roubles',
72 | 'rp',
73 | 'rupee',
74 | 'rupees',
75 | 'shekel',
76 | 'shekels',
77 | 'sheqel',
78 | 'sheqels',
79 | 'shilling',
80 | 'shillings',
81 | 'sterling',
82 | 'sterlings',
83 | 'usd',
84 | 'xaf',
85 | 'xof',
86 | 'yen',
87 | 'yuan',
88 | 'yuans',
89 | 'zł',
90 | 'zloty',
91 | 'zlotys',
92 | 'ден',
93 | 'лв',
94 | 'руб',
95 | ]
96 |
--------------------------------------------------------------------------------
/src/03-three/numbers/data.js:
--------------------------------------------------------------------------------
// French number words. Each row is [value, cardinal word, ordinal word].
export default {

  // 0 to 19
  // NOTE(review): 'zero' is spelled without its accent ('zéro') - confirm this matches the normalized text
  ones: [
    [0, 'zero', 'zeroième'],
    [1, 'un', 'unième'],
    [2, 'deux', 'deuxième'],
    [3, 'trois', 'troisième'],
    [4, 'quatre', 'quatrième'],
    [5, 'cinq', 'cinquième'],
    [6, 'six', 'sixième'],
    [7, 'sept', 'septième'],
    [8, 'huit', 'huitième'],
    [9, 'neuf', 'neuvième'],
    [10, 'dix', 'dixième'],
    [11, 'onze', 'onzième'],
    [12, 'douze', 'douzième'],
    [13, 'treize', 'treizième'],
    [14, 'quatorze', 'quatorzième'],
    [15, 'quinze', 'quinzième'],
    [16, 'seize', 'seizième'],
    [17, 'dix sept', 'dix septième'],
    [18, 'dix huit', 'dix huitième'],
    [19, 'dix neuf', 'dix neuvième'],
  ],
  // multiples of ten, 20 to 90
  tens: [
    [20, 'vingt', 'vingtième'],
    [30, 'trente', 'trentième'],
    [40, 'quarante', 'quarantième'],
    [50, 'cinquante', 'cinquantième'],
    [60, 'soixante', 'soixantième'],
    [70, 'soixante dix', 'soixante dixième'],
    [80, 'quatre vingt', 'quatre vingtième'],
    [90, 'quatre vingt dix', 'quatre vingt dixième'],
  ],
  // larger magnitudes
  multiples: [
    [100, 'cent', 'centième'],
    [1000, 'mille', 'millième'],
    [1000000, 'million', 'millionième'],// million 1,000,000
    [1000000000, 'milliard', 'milliardième'],// billion 1,000,000,000
    // [1000000000000, 'mille milliards', 'mille milliardième'],// trillion 1,000,000,000,000
  ]

}
--------------------------------------------------------------------------------
/plugins/dates/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "fr-compromise-dates",
3 | "description": "plugin for fr-compromise",
4 | "version": "0.0.2",
5 | "author": "Spencer Kelly (http://spencermounta.in)",
6 | "main": "./src/plugin.js",
7 | "unpkg": "./builds/fr-compromise-dates.min.js",
8 | "module": "./builds/fr-compromise-dates.mjs",
9 | "type": "module",
10 | "sideEffects": false,
11 | "types": "./index.d.ts",
12 | "exports": {
13 | ".": {
14 | "import": "./src/plugin.js",
15 | "require": "./builds/fr-compromise-dates.cjs",
16 | "types": "./index.d.ts"
17 | }
18 | },
19 | "repository": {
20 | "type": "git",
21 | "url": "git://github.com/nlp-compromise/fr-compromise.git"
22 | },
23 | "homepage": "https://github.com/nlp-compromise/fr-compromise/tree/master/plugins/dates",
24 | "scripts": {
25 | "test": "tape \"./tests/**/*.test.js\" | tap-dancer --color always",
26 | "testb": "cross-env TESTENV=prod tape \"./tests/**/*.test.js\" | tap-dancer --color always",
27 | "watch": "amble ./scratch.js",
28 | "perf": "node ./scripts/perf.js",
29 | "build": "rollup -c --silent"
30 | },
31 | "files": [
32 | "builds/",
33 | "src/",
34 | "index.d.ts"
35 | ],
36 | "eslintIgnore": [
37 | "builds/*.js"
38 | ],
39 | "peerDependencies": {
40 | "fr-compromise": ">=0.2.0"
41 | },
42 | "dependencies": {
43 | "spacetime": "7.4.3",
44 | "spacetime-holiday": "0.3.0"
45 | },
46 | "license": "MIT"
47 | }
--------------------------------------------------------------------------------
/src/03-three/contractions/api.js:
--------------------------------------------------------------------------------
const titleCase = /^\p{Lu}[\p{Ll}'’]/u // an uppercase letter, then a lowercase letter or an apostrophe
2 | // import contract from './contract.js'
3 |
/**
 * Upper-case the first letter of a string, leaving any leading spaces alone.
 * Matches any Unicode lowercase letter, so accented and ligature initials
 * ('é', 'œ', ...) are capitalised too - not only a-z and Latin-1.
 * @param {string} [str=''] - text to capitalise
 * @returns {string} the text with its first letter upper-cased
 */
const toTitleCase = function (str = '') {
  return str.replace(/^ *\p{Ll}/u, x => x.toUpperCase())
}
8 |
/**
 * Plugin hook that registers contraction support on a View class.
 * Adds `View.prototype.contractions()`, which returns a Contractions view
 * offering `.expand()`.
 * @param {Function} View - the View class to extend
 */
const api = function (View) {
  /** a view over terms that carry a contraction (an `implicit` expansion) */
  class Contractions extends View {
    constructor(document, pointer, groups) {
      super(document, pointer, groups)
      this.viewType = 'Contraction'
    }
    /**
     * Replace each term's text with its implicit form - i've -> 'i have'
     * Mutates the underlying terms, then recomputes their normalized text.
     * @returns {Contractions} this view, for chaining
     */
    expand() {
      this.docs.forEach(terms => {
        // remember the casing before the text gets overwritten
        let isTitleCase = titleCase.test(terms[0].text)
        terms.forEach((t, i) => {
          t.text = t.implicit
          delete t.implicit
          // the expanded words need a space between them
          if (i < terms.length - 1 && t.post === '') {
            t.post += ' '
          }
          // flag it as dirty
          t.dirty = true
        })
        // make the first word title-case?
        if (isTitleCase) {
          terms[0].text = toTitleCase(terms[0].text)
        }
      })
      this.compute('normal') //re-set normalized text
      return this
    }
  }
  /**
   * Find the contractions in this view.
   * @param {number} [n] - when given, keep only the nth contraction
   * @returns {Contractions}
   */
  View.prototype.contractions = function (n) {
    let m = this.match('@hasContraction+')
    // honour the optional index, as the type declarations promise
    if (typeof n === 'number') {
      m = m.eq(n)
    }
    return new Contractions(this.document, m.pointer)
  }
  // View.prototype.contract = contract
}
46 |
47 | export default api
--------------------------------------------------------------------------------
/src/03-three/numbers/parse/index.js:
--------------------------------------------------------------------------------
1 | import fromText from './fromText.js'
2 |
/**
 * Parse a number written in digits, like '12', '$5', '1,200' or '3e'.
 * @param {object} m - a match; its `.text('normal')` and `.length` are read
 * @returns {{hasComma: boolean, prefix: string, num: (number|string|null), suffix: string}}
 *   `num` is a number for a single-term match that holds digits, and null
 *   when those characters do not form a valid number (like a lone '-').
 */
const fromNumber = function (m) {
  let str = m.text('normal').toLowerCase()
  // drop a trailing ordinal marker - '2e', '1er'
  str = str.replace(/(e|er)$/, '')
  const hasComma = /,/.test(str)
  if (hasComma) {
    str = str.replace(/,/g, '')
  }
  // get prefix/suffix - splitting on the captured digit-run leaves it at index 1
  const arr = str.split(/([-0-9.,]*)/)
  let [prefix, num] = arr
  let suffix = arr.slice(2).join('')
  if (num !== '' && m.length < 2) {
    num = Number(num || str)
    // ensure that num is an actual number - Number() yields NaN, never a non-number
    if (Number.isNaN(num)) {
      num = null
    }
    // strip an ordinal off the suffix
    if (suffix === 'e' || suffix === 'er') {
      suffix = ''
    }
  }
  return {
    hasComma,
    prefix,
    num,
    suffix,
  }
}
33 |
/**
 * Interpret a number match, whether it is spelled out or written in digits.
 * @param {object} m - a match; `.has()` and `.docs` are read
 * @returns {object} hasComma, prefix, num and suffix, plus the
 *   isText / isOrdinal / isFraction / isMoney flags
 */
const parseNumber = function (m) {
  const isText = m.has('#TextValue')
  // written-out numbers carry no prefix, suffix or comma
  let details = { hasComma: false, prefix: '', num: null, suffix: '' }
  if (isText) {
    details.num = fromText(m.docs[0])
  } else {
    const { hasComma, prefix, num, suffix } = fromNumber(m)
    details = { hasComma, prefix, num, suffix }
  }
  return {
    ...details,
    isText,
    isOrdinal: m.has('#Ordinal'),
    isFraction: m.has('#Fraction'),
    isMoney: m.has('#Money'),
  }
}
61 | export default parseNumber
--------------------------------------------------------------------------------
/plugins/dates/src/phrase/date/units.js:
--------------------------------------------------------------------------------
1 | import spacetime from 'spacetime'
2 |
/**
 * A point in time with a unit of precision, wrapping a spacetime object in `.s`.
 * The base class works at millisecond precision.
 */
class Moment {
  /**
   * @param {*} input - anything spacetime accepts as a date
   * @param {object} [opts] - options; `opts.timezone` is handed to spacetime
   */
  constructor(input, opts) {
    this.unit = 'millisecond'
    this.opts = opts || {}
    // read from this.opts, so a missing opts argument does not throw
    this.s = spacetime(input, this.opts.timezone)
  }
  /** move to the start of the current unit */
  start() {
    this.s = this.s.startOf(this.unit)
    return this
  }
  /** move to the end of the current unit */
  end() {
    this.s = this.s.endOf(this.unit)
    return this
  }
  /** move to the middle of the current unit - a no-op here */
  mid() {
    //do nothing
    return this
  }
  /** the date in spacetime's iso format */
  iso() {
    return this.s.iso()
  }
}
25 |
26 |
/** a Moment with day precision */
class Day extends Moment {
  constructor(str, opts) {
    super(str, opts)
    this.unit = 'day'
  }
  /** move to noon - twelve hours after the start of the day */
  mid() {
    const { s } = this.start()
    this.s = s.add(12, 'hour')
    return this
  }
}
38 |
/** a Moment with week precision */
class Week extends Moment {
  constructor(str, opts) {
    super(str, opts)
    this.unit = 'week'
  }
  /** move three days past the start of the week (the original notes this as wednesday) */
  mid() {
    const { s } = this.start()
    this.s = s.add(3, 'day')
    return this
  }
}
50 |
/** a Moment with month precision */
class Month extends Moment {
  constructor(str, opts) {
    super(str, opts)
    this.unit = 'month'
  }
  /** move fourteen days past the start of the month */
  mid() {
    const { s } = this.start()
    this.s = s.add(14, 'days')
    return this
  }
}
62 |
/** a Moment with year precision */
class Year extends Moment {
  constructor(str, opts) {
    super(str, opts)
    this.unit = 'year'
  }
  /** move six months past the start of the year */
  mid() {
    const { s } = this.start()
    this.s = s.add(6, 'months')
    return this
  }
}
74 |
75 | export { Moment, Month, Day, Week, Year }
76 |
--------------------------------------------------------------------------------
/plugins/dates/tests/backburner/ambig-weekday.ignore.js:
--------------------------------------------------------------------------------
1 | import test from 'tape'
2 | import nlp from './_lib.js'
3 | import spacetime from 'spacetime'
4 |
// print an iso date in spacetime's '{iso-short}' form, or '-' when there is none
const fmt = function (iso) {
  if (!iso) {
    return '-'
  }
  return spacetime(iso).format('{iso-short}')
}
6 |
test('this monday', function (t) {
  // each row is [today, the date 'this monday' should resolve to]
  const cases = [
    ['2020-12-7', '2020-12-07'], //mon (itself)
    ['2020-12-8', '2020-12-14'], //tues
    ['2020-12-9', '2020-12-14'], //wed
    ['2020-12-10', '2020-12-14'], //thu
    ['2020-12-11', '2020-12-14'], //fri
    ['2020-12-12', '2020-12-14'], //sat
    ['2020-12-13', '2020-12-14'], //sun
  ]
  for (const [today, want] of cases) {
    const doc = nlp('this monday')
    const found = doc.dates({ today }).json()[0]
    t.equal(fmt(found.dates.start), want, 'monday-start')
    t.equal(fmt(found.dates.end), want, 'monday-end')
  }
  t.end()
})
25 |
26 | // test('last monday', function (t) {
27 | // let arr = [
28 | // ['2020-12-7', '2020-11-30'], //mon (obvious)
29 | // ['2020-12-8', '2020-11-30'], //tues
30 | // ['2020-12-9', '2020-11-30'], //wed
31 | // ['2020-12-10', '2020-11-30'], //thu
32 | // ['2020-12-11', '2020-11-30'], //fri
33 | // ['2020-12-12', '2020-11-30'], //sat
34 | // ['2020-12-13', '2020-11-30'], //sun
35 | // ]
36 | // arr.forEach((a) => {
37 | // let doc = nlp('last monday')
38 | // let found = doc.dates({ today: a[0] }).json()[0]
39 | // t.equal(fmt(found.date.start), a[1], 'last-monday-start')
40 | // t.equal(fmt(found.date.end), a[1], 'last-monday-end')
41 | // })
42 | // t.end()
43 | // })
44 |
--------------------------------------------------------------------------------
/data/lexicon/nouns/uncountables.js:
--------------------------------------------------------------------------------
// Lexicon list of uncountable nouns.
// The trailing comments give an english gloss; an empty gloss repeats the one above it.
export default [
  'anglais',
  'os',
  'bois',
  'corps',
  'bras',
  'poids',
  'repas',
  'sens',


  'conseils',//advice
  'munitions',//ammunition
  'asperges',//asparagus
  'combles',//attic
  'spectateurs',//audience
  'auditeurs',//
  // NOTE(review): 'baggage' is the english spelling - the french word is the next entry; confirm it is wanted
  'baggage',//luggage
  'bagages',//
  'brocolis',//broccoli
  'affaires',//business
  'dégâts',//damage
  'céréales',//cereal
  'échecs',//chess
  'vêtements',//clothing
  'coordonnées',//address
  'ténèbres',//darkness
  // NOTE(review): looks like english 'data' fused with 'données' - presumably meant 'données'; confirm
  'datadonnées',//**
  'débris',//debris
  'arrhes',//deposit
  'recherches',//research
  'fiançailles',//engagement
  'remords',//remorse
  'victuailles',//food
  'prévisions',//forecast
  'fruits',//fruit
  'funérailles',//funeral
  'obsèques',//
  'meubles',//furniture
  // NOTE(review): 'garbage' is an english word - the french words are the next two entries; confirm it is wanted
  'garbage',//rubbish
  'ordures',
  'déchets',//
  'graffitis',//graffiti
  'cheveux',//hair
  'ravages',//havoc
  'foins',//hay
  'chevrons',//herringbone
  'devoirs',//homework
  'renseignements',//information
  'médicaments',//medicine
  'abats',//offal
  'pâtes',//pasta
  'décombres',//rubble
  // NOTE(review): the '*' is part of the string - presumably meant 'sciences'; confirm
  'sciences*',//science
  'crevettes',//shrimp
  'logiciels',//software
  'spaghettis',//spaghetti
  'épinards',//spinach
  'parasites',//static
  'transports',//transportation
  'vacances',//vacation
  'environs',//vicinity
  'fumerolles',//gas
  'noces',//wedding
]
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/1st-pass/year.js:
--------------------------------------------------------------------------------
1 | const min = 1400
2 | const max = 2100
3 |
// words that hint a neighbouring 4-digit number is a year
// 'après' is listed with and without its accent - the normalized text presumably keeps french accents
const dateWords = new Set(['pendant', 'dans', 'avant', 'apres', 'après', 'pour', 'en'])
5 |
// strong evidence of a year: a date-ish word, or a term already tagged as a date
const seemsGood = function (term) {
  if (!term) {
    return false
  }
  const dateTags = ['Date', 'Month', 'WeekDay']
  return dateWords.has(term.normal) || dateTags.some(tag => term.tags.has(tag))
}
18 |
// weaker evidence of a year: sitting next to an ordinal
const seemsOkay = function (term) {
  return Boolean(term) && term.tags.has('Ordinal')
}
28 |
// recognize '1993' as a year
// returns true when the term was tagged as a Year, otherwise null
const tagYear = function (terms, i, world) {
  const { setTag } = world.methods.one
  const term = terms[i]
  const isCandidate = term.tags.has('NumericValue') && term.tags.has('Cardinal') && term.normal.length === 4
  if (!isCandidate) {
    return null
  }
  const num = Number(term.normal)
  // the number must fall strictly between min and max
  if (!num || isNaN(num) || num <= min || num >= max) {
    return null
  }
  const before = terms[i - 1]
  const after = terms[i + 1]
  if (seemsGood(before) || seemsGood(after)) {
    setTag([term], 'Year', world, false, '2-tagYear')
    return true
  }
  // or is it really-close to a year?
  // NOTE(review): the 1950-2025 window is hard-coded - confirm it is still the intended range
  const isRecent = num > 1950 && num < 2025
  if (isRecent && (seemsOkay(before) || seemsOkay(after))) {
    setTag([term], 'Year', world, false, '2-tagYear-close')
    return true
  }
  return null
}
54 | export default tagYear
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/3rd-pass/adj-gender.js:
--------------------------------------------------------------------------------
1 | // maître
2 | // traître
3 |
// guess an adjective's gender from its ending - 'f' or 'm'
const guessGender = function (str) {
  // female singular - a final 'e' or 'ë'
  if (str.match(/[eë]$/)) {
    return 'f'
  }
  // female plural endings
  const femalePlurals = [
    /[aei]lles$/,
    /[aei]les$/,
    /[aeiou]ttes$/,
    /ntes$/,
    /i[vct]es$/,
    /uses$/,
    /sses$/,
    /[èuay]res$/,
    /ires$/,
    /ées$/,
    /ues$/,
    /ies$/,
    /ée$/,
    /[ndvt]es$/,
  ]
  // anything else falls back to male
  return femalePlurals.some(reg => reg.test(str)) ? 'f' : 'm'
}
35 |
/**
 * Guess a gender tag for each Adjective that does not have one yet.
 * @param {object[]} terms - the terms of one sentence
 * @param {number} i - index of the term to look at
 * @param {object} world - supplies `methods.one.setTag`
 * @returns {*} whatever setTag returns, or null when nothing was tagged
 */
const adjGender = function (terms, i, world) {
  let setTag = world.methods.one.setTag
  let term = terms[i]
  let tags = term.tags
  // tag names carry no '#' here - the '#' prefix only belongs to match-syntax
  if (tags.has('Adjective') && !tags.has('FemaleAdjective') && !tags.has('MaleAdjective')) {
    let str = term.implicit || term.normal || term.text || ''
    // i actually think there are no exceptions.
    if (guessGender(str) === 'f') {
      return setTag([term], 'FemaleAdjective', world, false, '3-adj-gender')
    } else {
      return setTag([term], 'MaleAdjective', world, false, '3-adj-gender')
    }
  }
  return null
}
52 | export default adjGender
53 |
54 | // import data from '../../data/models/adjective/index.js'
55 | // let count = 0
56 | // Object.keys(data).forEach(m => {
57 | // let [f, mp, fp] = data[m]
58 | // if (guessGender(fp) !== 'f') {
59 | // console.log(fp)
60 | // count += 1
61 | // }
62 | // })
63 | // console.log(count)
64 |
--------------------------------------------------------------------------------
/plugins/dates/src/phrase/index.js:
--------------------------------------------------------------------------------
1 | import parseOne from './date/index.js'
2 | import { Moment, Month, Day, Week, Year } from './date/units.js'
3 |
4 |
// generic callback - parse the 'start' and 'end' groups of a match
// returns { start, end }, or null when no start date could be parsed
const startEnd = function (m, opts) {
  if (!m.found) {
    return null
  }
  const groups = m.groups()
  const start = parseOne(groups.start, opts)
  const end = parseOne(groups.end, opts)
  return start ? { start, end } : null
}
// parse the whole match as a single date - { start }, or null when it cannot be parsed
const justStart = function (m, opts) {
  const start = parseOne(m, opts)
  return start ? { start } : null
}
26 |
// 'until x' - a range from today to just before the parsed 'end' group
// returns null when the end date cannot be parsed
const untilEnd = function (m, opts) {
  const groups = m.groups()
  const start = new Moment(opts.today, opts)
  const parsed = parseOne(groups.end, opts)
  if (!parsed) {
    return null
  }
  // until - just before x
  const end = new Moment(parsed.s.minus(1, 'millisecond'), opts)
  return { start, end }
}
37 |
// phrase templates, each paired with the callback that interprets its match.
// the capture groups are named, because the callbacks read `start` / `end` from m.groups()
const phrases = [
  // 'entre sept et oct'
  { match: 'entre [<start>.*] et [<end>.*]', cb: startEnd },
  // 'jusqu'en juin' (until june)
  { match: '(jusqu|jusque) (en|a|à|au) [<end>#Date+]', cb: untilEnd },
  // fallback to parsing one date
  { match: '.*', cb: justStart },
]
46 |
/**
 * Interpret each matched date-phrase.
 * For every view, the phrase templates are tried in order, and the first
 * callback that returns a result wins. Views with no result are left out.
 * @param {object} matches - the views to interpret; iterated with `.forEach`
 * @param {object} opts - options handed to each callback
 * @returns {object[]} one result per view that could be parsed
 */
const parsePhrase = function (matches, opts) {
  const results = []
  matches.forEach(view => {
    for (const { match, cb } of phrases) {
      const m = view.match(match)
      const res = m.found ? cb(m, opts) : null
      if (res) {
        results.push(res)
        break
      }
    }
  })
  return results
}
66 | export default parsePhrase
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "author": "Spencer Kelly (http://spencermounta.in)",
3 | "name": "fr-compromise",
4 | "description": "Linguistique computationnelle modeste",
5 | "version": "0.2.8",
6 | "main": "./builds/fr-compromise.mjs",
7 | "unpkg": "./builds/fr-compromise.min.js",
8 | "type": "module",
9 | "sideEffects": false,
10 | "exports": {
11 | ".": {
12 | "import": "./builds/fr-compromise.mjs",
13 | "require": "./builds/fr-compromise.cjs",
14 | "types": "./types/index.d.ts"
15 | }
16 | },
17 | "types": "types/index.d.ts",
18 | "repository": {
19 | "type": "git",
20 | "url": "git://github.com/nlp-compromise/fr-compromise.git"
21 | },
22 | "scripts": {
23 | "test": "tape \"./tests/**/*.test.js\" | tap-dancer",
24 | "testb": "cross-env TESTENV=prod npm run test",
25 | "build": "npm run version && rollup -c --silent",
26 | "pack": "node ./scripts/pack.js",
27 | "watch": "amble ./scratch.js",
28 | "version": "node ./scripts/version.js",
29 | "score": "node ./learn/giga/test.js",
30 | "lint": "eslint ./src/**/*",
31 | "stress": "node scripts/stress.js"
32 | },
33 | "files": [
34 | "builds/",
35 | "types/",
36 | "src/"
37 | ],
38 | "dependencies": {
39 | "compromise": "14.10.0",
40 | "efrt": "2.7.0",
41 | "suffix-thumb": "5.0.2"
42 | },
43 | "devDependencies": {
44 | "@rollup/plugin-node-resolve": "15.2.0",
45 | "@rollup/plugin-terser": "0.4.3",
46 | "amble": "1.3.0",
47 | "cross-env": "^7.0.3",
48 | "eslint": "8.47.0",
49 | "eslint-plugin-regexp": "1.15.0",
50 | "fr-corpus": "^0.0.1",
51 | "rollup": "3.28.0",
52 | "tap-dancer": "0.3.4",
53 | "tape": "5.6.6"
54 | },
55 | "license": "MIT"
56 | }
57 |
--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
1 | import nlp from './_lib.js'
2 | import tokenize from './01-one/tokenize/plugin.js'
3 | import lexicon from './01-one/lexicon/plugin.js'
4 | import preTagger from './02-two/preTagger/plugin.js'
5 | import postTagger from './02-two/postTagger/plugin.js'
6 | import tagset from './02-two/tagset/plugin.js'
7 | import numbers from './03-three/numbers/plugin.js'
8 | import topics from './03-three/topics/plugin.js'
9 | import verbs from './03-three/verbs/plugin.js'
10 | import adjectives from './03-three/adjectives/plugin.js'
11 | import nouns from './03-three/nouns/plugin.js'
12 | import contractions from './03-three/contractions/plugin.js'
13 | import version from './_version.js'
14 |
// register every plugin, in this order
const plugins = [
  tokenize,
  tagset,
  lexicon,
  preTagger,
  postTagger,
  numbers,
  topics,
  verbs,
  adjectives,
  nouns,
  contractions,
]
for (const plugin of plugins) {
  nlp.plugin(plugin)
}
26 |
/**
 * Parse a text with the plugin-loaded library.
 * @param {string} txt - the text to parse
 * @param {object} [lex] - extra lexicon, passed straight through
 * @returns {object} whatever nlp() returns for this input
 */
const fr = function (txt, lex) {
  return nlp(txt, lex)
}
31 |
// copy constructor methods over
for (const key of Object.keys(nlp)) {
  if (nlp.hasOwnProperty(key)) {
    fr[key] = nlp[key]
  }
}

// this one is hidden - defineProperty leaves it non-enumerable
Object.defineProperty(fr, '_world', {
  writable: true,
  value: nlp._world,
})
44 |
45 |
46 |
/**
 * log the decision-making to console
 * @param {boolean|string} set - true turns every debug flag on; 'tagger',
 *   'match' or 'chunker' turns on just that one; anything else turns all off
 * @returns {Function} the function it was called on, for chaining
 */
fr.verbose = function (set) {
  let env
  if (typeof process === 'undefined') {
    // in a browser the flags live on the global scope - create the holder if
    // it is missing, otherwise they would be written to a throw-away object
    self.env = self.env || {}
    env = self.env
  } else {
    // note: node stores process.env values as strings, so true becomes 'true'
    env = process.env
  }
  env.DEBUG_TAGS = set === 'tagger' || set === true ? true : ''
  env.DEBUG_MATCH = set === 'match' || set === true ? true : ''
  env.DEBUG_CHUNKS = set === 'chunker' || set === true ? true : ''
  return this
}
55 | fr.version = version
56 |
57 | export default fr
--------------------------------------------------------------------------------
/learn/giga/getPairs.js:
--------------------------------------------------------------------------------
1 | import { forEachSync } from './_giga.js'
2 | import doSentences from './french.js'
3 | import fs from 'fs'
4 |
// zero-padded ids, '0001' through '0010'
let ids = Array.from({ length: 10 }, (_, i) => String(i + 1).padStart(4, '0'))
10 | // ids = ['0004']
11 |
12 | // ABR abbreviation
13 | // ADJ adjective
14 | // ADV adverb
15 |
16 | // VER:pres verb present
17 | // VER:simp verb simple past
18 | // VER:futu verb futur
19 | // VER:cond verb conditional
20 | // VER:impe verb imperative
21 | // VER:impf verb imperfect
22 | // VER:infi verb infinitive
23 | // VER:pper verb past participle
24 | // VER:ppre verb present participle
25 | // VER:subi verb subjunctive imperfect
26 | // VER:subp verb subjunctive present
27 |
28 | // "NOM": true,
29 | let pairs = {}
30 | const tag = 'NOM'
31 | // const prev = 'les'
32 |
33 | let results = {}
// print every french term carrying the wanted part-of-speech tag
// the first term is skipped, as it has no previous word
const doBoth = function (both) {
  for (const term of both.fr.slice(1)) {
    if (term['$'].pos !== tag) {
      continue
    }
    console.log(term)
    // let last = terms[i - 1]['$text'].toLowerCase()
    // if (last === prev) {
    //   let w = term['$text']
    //   let inf = term['$'].lem
    //   // console.log(last, w, inf)
    //   if (w && inf) {
    //     w = w.toLowerCase().trim()
    //     inf = inf.toLowerCase().trim()
    //     results[w] = inf
    //   }
    // }
  }
}
56 |
// process one id, logging (not rethrowing) any failure so the other ids still run
const runOne = async id => {
  try {
    console.log(`\ndoing ${id}:\n`)
    await doSentences(id, doBoth)
  } catch (e) {
    console.log(e)
  }
}

await forEachSync(ids, runOne)
console.log('done')
// write the collected pairs out as a module
results = Object.entries(results)
fs.writeFileSync('./pairs.js', `export default ${JSON.stringify(results)}`)
68 |
--------------------------------------------------------------------------------
/types/view/fr.ts:
--------------------------------------------------------------------------------
1 | import View from './one'
2 |
3 |
/** the view returned by `.numbers()` */
interface Numbers extends View {
  /** grab the parsed details of each number, as objects */
  parse: (n?: number) => object[]
  /** grab the parsed number */
  get: (n?: number) => number | number[]
  /** grab 'kilos' from '25 kilos' (currently disabled) */
  // units: () => View
  /** return only ordinal numbers */
  isOrdinal: () => View
  /** return only cardinal numbers */
  isCardinal: () => View
  /** convert number to `5` or `5th` */
  toNumber: () => View
  /** add commas, or nicer formatting for numbers */
  toLocaleString: () => View
  /** convert number to `five` or `fifth` */
  toText: () => View
  /** convert number to `five` or `5` */
  toCardinal: () => View
  /** convert number to `fifth` or `5th` */
  toOrdinal: () => View
  /** return numbers with this value - NOTE(review): no value parameter is declared; confirm the signature */
  isEqual: () => View
  /** return numbers bigger than min */
  greaterThan: (min: number) => View
  /** return numbers smaller than max */
  lessThan: (max: number) => View
  /** return numbers between min and max */
  between: (min: number, max: number) => View
  /** set number to n */
  set: (n: number) => View
  /** increase number by n */
  add: (n: number) => View
  /** decrease number by n */
  subtract: (n: number) => View
  /** increase number by 1 */
  increment: () => View
  /** decrease number by 1 */
  decrement: () => View
}
44 |
/** the view returned by `.contractions()` */
interface Contractions extends View {
  /** replace each contraction with its expanded words */
  expand(): View
}
49 |
50 |
51 |
/** a View with the french-specific methods added */
interface FrView extends View {
  /** return any contractions - terms that stand for more than one word, like "didn't" */
  contractions: (n?: number) => Contractions
  /** return the numbers in this view */
  numbers(): Numbers
  /** return the topics in this view - TODO describe what counts as a topic */
  topics(): View
}
60 |
61 | export default FrView
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/3rd-pass/verb-tense.js:
--------------------------------------------------------------------------------
1 | const tenses = [
2 | 'PresentTense',
3 | 'Infinitive',
4 | 'Imperative',
5 | 'Gerund',
6 | 'PastTense',
7 | 'Modal',
8 | 'Auxiliary',
9 | 'PerfectTense',
10 | 'Pluperfect',
11 | 'ConditionalVerb',
12 | 'FutureTense',
13 | ]
14 |
15 |
16 | let whichTense = [
17 |
18 | //er - present conditional
19 | ['erais', 'ConditionalVerb'],
20 | ['erait', 'ConditionalVerb'],
21 | ['erions', 'ConditionalVerb'],
22 | ['eriez', 'ConditionalVerb'],
23 | ['eraient', 'ConditionalVerb'],
24 |
25 | //er- future
26 | ['erai', 'FutureTense'],
27 | ['era', 'FutureTense'],
28 | ['erons', 'FutureTense'],
29 | ['erez', 'FutureTense'],
30 | ['eront', 'FutureTense'],
31 |
32 | // er - imparfait -> PastTense
33 | ['ais', 'PastTense'],
34 | ['ait', 'PastTense'],
35 | ['ions', 'PastTense'],
36 | ['iez', 'PastTense'],
37 | ['ient', 'PastTense'],
38 |
39 | // past-participle
40 | ['ées', 'PastParticiple'],
41 | ['és', 'PastParticiple'],
42 | ['ée', 'PastParticiple'],
43 | ['é', 'Participle'],
44 | ['u', 'Participle'],//entendu
45 | ]
46 |
47 |
// guess a tense tag each Verb
// a verb with no tense tag gets one from its suffix, or falls back to PresentTense
const verbTense = function (terms, i, world) {
  const { setTag } = world.methods.one
  const term = terms[i]
  const { tags } = term
  if (!tags.has('Verb')) {
    return null
  }
  const str = term.implicit || term.normal || term.text || ''
  // leave it alone if we already have a tense
  if (tenses.some(tense => tags.has(tense))) {
    return null
  }
  // the first matching suffix wins, so longer endings must be listed first
  const hit = whichTense.find(([suffix]) => str.endsWith(suffix))
  if (hit) {
    setTag([term], hit[1], world, false, '3-tense-suffix-' + hit[1])
  } else {
    setTag([term], 'PresentTense', world, false, '3-tense-fallback')
  }
  return null
}
68 | export default verbTense
--------------------------------------------------------------------------------
/learn/wikinews/getSuffix.js:
--------------------------------------------------------------------------------
1 | let lines = require('./parse')
2 | // lines = lines.slice(0, 300)
3 | const end = 5
4 |
5 | // 'P+D': 241,
6 | // ADJ: 719,
7 | // ADV: 311,
8 | // CC: 172,
9 | // CLO: 32,
10 | // CLR: 53,
11 | // CLS: 88,
12 | // CS: 90,
13 | // DET: 1353,
14 | // ET: 136,
15 |
16 | // nouns:
17 | // NC: 1877,
18 | // NPP: 493,
19 | // P: 1242,
20 | // PREF: 8,
21 |
22 | // PRO: 43, //pronoun
23 | // PROREL: 89, //relative pronoun
24 | // U: 100,
25 |
26 | // V: 509,
27 | // VINF: 140,
28 | // VPP: 402,
29 | // VPR: 61,
30 | // VS: 10,
31 |
// count how often each word-ending is seen with each tag
let tags = {}
for (const sentence of lines) {
  for (const w of sentence) {
    const len = w.word.length
    // too short to have a suffix of this size
    if (len <= end) {
      continue
    }
    const suffix = w.word.toLowerCase().substr(len - end, len)
    // suffix = suffix.replace(/[éèêë]/, 'e')
    // suffix = suffix.replace(/[ï]/, 'i')
    // suffix = suffix.replace(/[û]/, 'u')
    // ignore anything with a digit in it
    if (suffix.match(/[0-9]/)) {
      continue
    }
    const counts = tags[suffix] || {}
    counts[w.tag] = (counts[w.tag] || 0) + 1
    tags[suffix] = counts
  }
}
51 |
// keep the suffixes that are seen (almost) only with the wanted tag
let found = {}
const wantTag = 'N'
for (const [suffix, counts] of Object.entries(tags)) {
  let names = Object.keys(counts)
  // with two tags and a well-attested wanted tag, drop any tag seen only once
  if (names.length === 2 && counts[wantTag] > 5) {
    for (const name of names) {
      if (counts[name] === 1) {
        delete counts[name]
      }
    }
    names = Object.keys(counts)
    // console.log(tags[k])
    // console.log(foundTags)
  }
  if (names.length !== 1) {
    continue
  }
  const [only] = names
  // only well-attested suffixes are kept
  if (counts[only] > 1 && only === wantTag && counts[wantTag] > 90) {
    // console.log(tags[k])
    found[suffix] = only
  }
  // console.log(k+':' foundTags[0], count)
}
console.log(found)
78 |
--------------------------------------------------------------------------------
/tests/conjugate.test.js:
--------------------------------------------------------------------------------
1 | import test from 'tape'
2 | import nlp from './_lib.js'
3 | let here = '[conjugate] '
4 | nlp.verbose(false)
5 |
test('adj-conjugate:', function (t) {
  const all = ["sanglant", "sanglante", "sanglants", "sanglantes"]
  // whichever form we start from, all four forms should come back
  const labels = ['from-male', 'from-female', 'from-plural', 'from-female-plural']
  labels.forEach((label, i) => {
    const res = nlp(all[i]).adjectives().conjugate()[0]
    t.deepEqual(Object.values(res), all, here + label)
  })
  t.end()
})
14 |
test('noun-conjugate:', function (t) {
  // conjugate a noun, and pull out its [singular, plural] pair
  const pair = str => {
    const o = nlp(str).nouns().conjugate()[0]
    return [o.singular, o.plural]
  }
  let all = ["cargaison", "cargaisons"]
  t.deepEqual(pair(all[0]), all, here + 'from-sing')
  t.deepEqual(pair(all[1]), all, here + 'from-plural')

  // the same word for singular and plural
  all = ["bois", "bois"]
  t.deepEqual(pair(all[0]), all, here + 'from-sing')
  t.end()
})
27 |
test('verb-conjugate:', function (t) {
  const all = ["endors", "endors", "endort", "endormons", "endormez", "endorment"]
  // whichever present-tense form we start from, the full set should come back
  const labels = ['from-first', 'from-2nd', 'from-3d', 'from-1p', 'from-2p', 'from-3p']
  labels.forEach((label, i) => {
    const res = nlp(all[i]).verbs().conjugate()[0]
    t.deepEqual(Object.values(res.PresentTense), all, here + label)
  })
  t.end()
})
--------------------------------------------------------------------------------
/src/01-one/tokenize/unicode.js:
--------------------------------------------------------------------------------
1 | //a hugely-ignorant, and widely subjective transliteration of latin, cyrillic, greek unicode characters to english ascii.
2 | //approximate visual (not semantic or phonetic) relationship between unicode and ascii characters
3 | //http://en.wikipedia.org/wiki/List_of_Unicode_characters
4 | //https://docs.google.com/spreadsheet/ccc?key=0Ah46z755j7cVdFRDM1A2YVpwa1ZYWlpJM2pQZ003M0E
5 |
6 |
7 | // allowed french symbols
8 | // ç – la cédille (the cedilla)
9 | // é – l'accent aigu (the acute accent)
10 | // â/ê/î/ô/û – l'accent circonflexe (the circumflex)
11 | // à/è/ì/ò/ù – l'accent grave (the grave accent)
12 | // ë/ï/ü
13 | let compact = {
14 | '!': '¡',
15 | '?': '¿Ɂ',
16 | '"': '“”"❝❞',
17 | "'": '‘‛❛❜’',
18 | '-': '—–',
19 | a: 'ªÁÃÄÅáãäåĀāĂ㥹ǍǎǞǟǠǡǺǻȀȁȂȃȦȧȺΆΑΔΛάαλАаѦѧӐӑӒӓƛæ',
20 | b: 'ßþƀƁƂƃƄƅɃΒβϐϦБВЪЬвъьѢѣҌҍ',
21 | c: '¢©ĆćĈĉĊċČčƆƇƈȻȼͻͼϲϹϽϾСсєҀҁҪҫ',
22 | d: 'ÐĎďĐđƉƊȡƋƌ',
23 | e: 'ĒēĔĕĖėĘęĚěƐȄȅȆȇȨȩɆɇΈΕΞΣέεξϵЀЁЕеѐёҼҽҾҿӖӗ',
24 | f: 'ƑƒϜϝӺӻҒғſ',
25 | g: 'ĜĝĞğĠġĢģƓǤǥǦǧǴǵ',
26 | h: 'ĤĥĦħƕǶȞȟΉΗЂЊЋНнђћҢңҤҥҺһӉӊ',
27 | I: 'Í',
28 | i: 'íĨĩĪīĬĭĮįİıƖƗȈȉȊȋΊΐΪίιϊІЇії',
29 | j: 'ĴĵǰȷɈɉϳЈј',
30 | k: 'ĶķĸƘƙǨǩΚκЌЖКжкќҚқҜҝҞҟҠҡ',
31 | l: 'ĹĺĻļĽľĿŀŁłƚƪǀǏǐȴȽΙӀӏ',
32 | m: 'ΜϺϻМмӍӎ',
33 | n: 'ÑñŃńŅņŇňʼnŊŋƝƞǸǹȠȵΝΠήηϞЍИЙЛПийлпѝҊҋӅӆӢӣӤӥπ',
34 | o: 'ÓÕÖØðóõöøŌōŎŏŐőƟƠơǑǒǪǫǬǭǾǿȌȍȎȏȪȫȬȭȮȯȰȱΌΘΟθοσόϕϘϙϬϴОФоѲѳӦӧӨөӪӫ',
35 | p: 'ƤΡρϷϸϼРрҎҏÞ',
36 | q: 'Ɋɋ',
37 | r: 'ŔŕŖŗŘřƦȐȑȒȓɌɍЃГЯгяѓҐґ',
38 | s: 'ŚśŜŝŞşŠšƧƨȘșȿЅѕ',
39 | t: 'ŢţŤťŦŧƫƬƭƮȚțȶȾΓΤτϮТт',
40 | u: 'µÚúŨũŪūŬŭŮůŰűŲųƯưƱƲǓǔǕǖǗǘǙǚǛǜȔȕȖȗɄΰμυϋύ',
41 | v: 'νѴѵѶѷ',
42 | w: 'ŴŵƜωώϖϢϣШЩшщѡѿ',
43 | x: '×ΧχϗϰХхҲҳӼӽӾӿ',
44 | y: 'ÝýÿŶŷŸƳƴȲȳɎɏΎΥΫγψϒϓϔЎУучўѰѱҮүҰұӮӯӰӱӲӳ',
45 | z: 'ŹźŻżŽžƵƶȤȥɀΖ',
46 | oe: 'œ',
47 | }
// unpack the compact table into a flat lookup: each listed character -> its ascii stand-in
const unicode = {}
for (const [ascii, chars] of Object.entries(compact)) {
  for (const ch of chars.split('')) {
    unicode[ch] = ascii
  }
}
55 |
56 | export default unicode
--------------------------------------------------------------------------------
/data/lexicon/nouns/masculine.js:
--------------------------------------------------------------------------------
1 | export default ['bateau', 'parapluie', // NOTE(review): the file is named "masculine", but many entries below are feminine nouns in standard French (e.g. 'échelle', 'année', 'table', 'eau', 'mer') — confirm how the consumer interprets this list
2 |
3 |
4 | 'échelle',
5 | 'végétale',
6 | 'automobile',
7 | 'file',
8 | 'mobile',
9 | 'année',
10 | 'musée',
11 | 'idée',
12 |
13 | 'pratique',
14 | 'statistique',
15 | 'politique',
16 | 'musique',
17 | 'technique',
18 |
19 | 'table',
20 | 'ensemble',
21 | 'bénéficiaire',
22 | 'commentaire',
23 | 'affaire',
24 | 'partenaire',
25 | 'gestionnaire',
26 | 'fonctionnaire',
27 | 'salaire',
28 |
29 | 'animal',
30 | 'taux',
31 | 'niveau',
32 | 'réseau',
33 | 'bureau',
34 | 'journal',
35 | 'eau',
36 |
37 | 'entente',
38 | 'vente',
39 | 'atteinte',
40 | 'plante',
41 | 'plainte',
42 |
43 | 'jeu',
44 | // 'enjeux',
45 | 'lieu',
46 |
47 | 'perspective',
48 | 'initiative',
49 | 'élève',
50 |
51 | 'objectif',
52 | 'tarif',
53 |
54 | 'avenir',
55 | 'air',
56 |
57 | 'janvier',
58 | 'hiver',
59 | 'mer',
60 | 'dossier',
61 | 'degré',
62 |
63 | 'droit',
64 | 'crédit',
65 | 'profit',
66 | 'endroit',
67 |
68 |
69 |
70 | 'gouvernement', // the '-ment' nouns are listed explicitly; NOTE(review): presumably because the suffix table would otherwise tag them as adverbs — confirm
71 | 'développement',
72 | 'financement',
73 | 'enseignement',
74 | 'rendement',
75 | 'environnement',
76 | 'établissement',
77 | 'enregistrement',
78 | 'document',
79 | 'investissement',
80 | 'moment',
81 | 'règlement',
82 | 'traitement',
83 | 'engagement',
84 | 'paiement',
85 | 'approvisionnement',
86 | 'changement',
87 | 'élément',
88 | 'équipement',
89 | 'événement',
90 | 'fonctionnement',
91 | 'parlement',
92 | 'perfectionnement',
93 | 'agrément',
94 | 'accroissement',
95 | 'renforcement',
96 | 'renouvellement',
97 | 'recensement',
98 | 'remboursement',
99 | 'segment',
100 | 'recrutement',
101 | 'mouvement',
102 |
103 | 'donnée',
104 | 'restaurant',
105 | 'espace',
106 |
107 |
108 | ]
109 |
--------------------------------------------------------------------------------
/types/index.d.ts:
--------------------------------------------------------------------------------
1 | import { Lexicon, Plugin, matchOptions, Match, Net } from './misc'
2 | import View from './view/fr'
3 |
/** parse a given text */
declare function nlp(text: string, lexicon?: Lexicon): View

// Constructor: static helpers that hang off the `nlp` function itself
declare module nlp {
  /** interpret text without tagging */
  export function tokenize(text: string, lexicon?: Lexicon): View
  /** scan through text with minimal analysis */
  export function lazy(text: string, match?: string): View
  /** mix-in a compromise plugin */
  export function plugin(plugin: Plugin): any
  /** mix-in a compromise plugin */
  export function extend(plugin: Plugin): any
  /** turn a match-string into json */
  export function parseMatch(match: string, opts?: matchOptions): object[]
  /** grab library internals */
  export function world(): object
  /** grab library metadata */
  export function model(): object
  /** grab exposed library methods */
  export function methods(): object
  /** which compute functions run automatically */
  export function hooks(): string[]
  /** log our decision-making for debugging */
  export function verbose(toLog?: boolean | string): any
  /** current semver version of the library */
  export const version: string
  /** connect new tags to tagset graph */
  export function addTags(tags: object): any
  /** add new words to internal lexicon */
  export function addWords(words: Lexicon): any
  /** turn a list of words into a searchable graph */
  export function buildTrie(words: string[]): object
  /** compile a set of match objects to a more optimized form */
  export function buildNet(matches: Match[]): Net
  /** add words to the autoFill dictionary */
  export function typeahead(words: Lexicon): any
  /**
   * export internal methods for plugins.
   * `Methods` is a type parameter: it was previously referenced without being
   * declared anywhere, which does not type-check. The `object` default keeps a
   * bare `TypedPlugin` reference valid, and the `extends object` constraint
   * keeps `methods` compatible with `Plugin['methods']`.
   */
  export interface TypedPlugin<Methods extends object = object> extends Plugin { methods: Methods }
}
44 |
45 | export default nlp
46 |
47 |
--------------------------------------------------------------------------------
/src/02-two/preTagger/model/suffixes.js:
--------------------------------------------------------------------------------
1 | const rb = 'Adverb' // short aliases for the tag names used in the tables below
2 | const nn = 'Noun'
3 | const vb = 'Verb'
4 | const jj = 'Adjective'
5 | const inf = 'Infinitive'
6 | // const pres = 'PresentTense'
7 |
8 |
9 | export default [ // one table per suffix length: the array index matches the "N-letter" label inside each table
10 | null, // index 0: no table
11 | null, // index 1: no table
12 | {
13 | //2-letter
14 | ce: nn,//connaissance
15 | ge: nn,
16 | ie: nn,
17 |
18 | er: inf,
19 | ir: inf,
20 | ée: vb,
21 | és: vb,
22 | sé: vb,
23 | ré: vb,
24 | çu: vb,//conçu
25 | ra: vb,//faudra
26 | it: vb,//fournit
27 | ez: vb,//consultez
28 |
29 | if: jj,//descriptif
30 | },
31 | {
32 | //3-letter
33 | ité: nn, //qualité
34 | eur: nn,//directeur
35 | ces: nn,//connaissances
36 |
37 | ées: vb,//énoncées
38 | ait: vb,//devrait
39 | era: vb,//aidera
40 | ser: vb,//utiliser
41 | ter: vb,//adopter
42 |
43 | ive: jj, //active
44 | ifs: jj, //relatifs
45 | ile: jj, //civile
46 | ale: jj, //nationale
47 | ble: jj, //capable
48 | aux: jj, //nationaux
49 | eux: jj, //précieux
50 | nte: jj, //différente
51 | },
52 | {
53 | //4-letter
54 | ment: rb, // the adverbs lexicon file notes that all '-ment' words are tagged by this suffix
55 |
56 | elle: jj,
57 | bles: jj,
58 | ales: jj,
59 | ique: jj,
60 | aire: jj,
61 | ives: jj,
62 | ntes: jj, //différentes
63 |
64 | sent: vb,//produisent
65 |
66 | sion: nn,//commission
67 | eurs: nn,//directeurs
68 | tion: nn,//amélioration
69 | ance: nn,//croissance
70 | euse: jj,//rigoureuse
71 | ouce: jj//douce
72 | },
73 | {
74 | //5-letter
75 | tions: nn,//améliorations
76 | ments: nn,//aliments
77 | sions: nn,//commissions
78 |
79 | aient: vb,//auraient
80 | arant: vb,//préparant
81 | irant: vb,//inspirant
82 | orant: vb,//élaborant
83 | urant: vb,//assurant
84 | trant: vb,//montrant
85 | llant: vb,//détaillant
86 |
87 | ouces: jj,//douces
88 | elles: jj,
89 | iques: jj,
90 | aires: jj,
91 | euses: jj
92 | },
93 | {
94 | //6-letter (empty for now)
95 | },
96 | {
97 | //7-letter (empty for now)
98 | },
99 | ]
--------------------------------------------------------------------------------
/src/03-three/nouns/api.js:
--------------------------------------------------------------------------------
/**
 * Narrow a view to its n-th result.
 * @param {object} doc - a view exposing `.eq()`
 * @param {number} [n] - index to select; any non-number leaves the view as-is
 * @returns {object} `doc.eq(n)` when n is a number, otherwise `doc` itself
 */
export const getNth = (doc, n) => {
  if (typeof n === 'number') {
    return doc.eq(n)
  }
  return doc
}

// get root form of a noun: run the 'root' compute, then read the 'root' text back
const getRoot = function (m) {
  m.compute('root')
  return m.text('root')
}

/**
 * Plugin api: adds a `.nouns()` method to View, returning a Nouns sub-view.
 * @param {Function} View - the View class to extend
 */
const api = function (View) {
  class Nouns extends View {
    constructor(document, pointer, groups) {
      super(document, pointer, groups)
      this.viewType = 'Nouns'
    }

    /** list the singular and plural form of each noun (or only the n-th) */
    conjugate(n) {
      const transform = this.methods.two.transform.noun
      const describe = m => {
        const str = m.text()
        // already plural: derive the singular
        if (m.has('#PluralNoun')) {
          return { plural: str, singular: transform.fromPlural(str) }
        }
        // uncountable nouns share one form
        const plural = m.has('#Uncountable') ? str : transform.toPlural(str)
        return { singular: str, plural }
      }
      return getNth(this, n).map(describe, [])
    }

    /** keep only the plural nouns */
    isPlural(n) {
      return getNth(this, n).if('#PluralNoun')
    }

    /** replace each singular noun with its plural */
    toPlural(n) {
      const transform = this.methods.two.transform.noun
      const singulars = getNth(this, n).if('#Singular')
      return singulars.map(m => m.replaceWith(transform.toPlural(getRoot(m))))
    }

    /** replace each plural noun with its singular */
    toSingular(n) {
      const transform = this.methods.two.transform.noun
      const plurals = getNth(this, n).if('#PluralNoun')
      return plurals.map(m => m.replaceWith(transform.fromPlural(getRoot(m))))
    }
  }

  View.prototype.nouns = function (n) {
    const found = getNth(this.match('#Noun'), n)
    return new Nouns(this.document, found.pointer)
  }
}
export default api
--------------------------------------------------------------------------------
/src/02-two/tagset/tags/misc.js:
--------------------------------------------------------------------------------
1 | const anything = ['Noun', 'Verb', 'Adjective', 'Adverb', 'Value', 'QuestionWord'] // shared exclusion list; only Conjunction uses it below
2 |
3 | export default { // tag definitions; NOTE(review): `is` presumably names the parent tag and `not` the tags that cannot co-exist — the code that reads these is outside this file
4 | Adjective: {
5 | not: ['Noun', 'Verb', 'Adverb', 'Value'],
6 | },
7 | Comparable: {
8 | is: 'Adjective',
9 | },
10 | Comparative: {
11 | is: 'Adjective',
12 | },
13 | Superlative: {
14 | is: 'Adjective',
15 | not: ['Comparative'],
16 | },
17 | MaleAdjective: {
18 | is: 'Adjective',
19 | not: ['FemaleAdjective'],
20 | },
21 | FemaleAdjective: {
22 | is: 'Adjective',
23 | not: ['MaleAdjective'],
24 | },
25 | PluralAdjective: {
26 | is: 'Adjective',
27 | },
28 | NumberRange: {},
29 | Adverb: {
30 | not: ['Noun', 'Verb', 'Adjective', 'Value'],
31 | },
32 |
33 | Determiner: {
34 | not: ['Noun', 'Verb', 'Adjective', 'Adverb', 'QuestionWord', 'Conjunction', 'Preposition'], //'Value' is deliberately left out, to allow 'a' to be a Determiner/Value
35 | },
36 | Conjunction: {
37 | not: anything,
38 | },
39 | Preposition: {
40 | not: ['Noun', 'Verb', 'Adjective', 'Adverb', 'QuestionWord'],
41 | },
42 | QuestionWord: {
43 | not: ['Determiner'],
44 | },
45 | Currency: {
46 | is: 'Noun',
47 | },
48 | Expression: {
49 | not: ['Noun', 'Adjective', 'Verb', 'Adverb'],
50 | },
51 | Abbreviation: {},
52 | Url: {
53 | not: ['HashTag', 'PhoneNumber', 'Verb', 'Adjective', 'Value', 'AtMention', 'Email'],
54 | },
55 | PhoneNumber: {
56 | not: ['HashTag', 'Verb', 'Adjective', 'Value', 'AtMention', 'Email'],
57 | },
58 | HashTag: {},
59 | AtMention: {
60 | is: 'Noun',
61 | not: ['HashTag', 'Email'],
62 | },
63 | Emoji: {
64 | not: ['HashTag', 'Verb', 'Adjective', 'Value', 'AtMention'],
65 | },
66 | Emoticon: {
67 | not: ['HashTag', 'Verb', 'Adjective', 'Value', 'AtMention'],
68 | },
69 | Email: {
70 | not: ['HashTag', 'Verb', 'Adjective', 'Value', 'AtMention'],
71 | },
72 | Acronym: {
73 | not: ['PluralNoun', 'RomanNumeral'],
74 | },
75 | Negative: {
76 | not: ['Noun', 'Adjective', 'Value'],
77 | },
78 | Condition: {
79 | not: ['Verb', 'Adjective', 'Noun', 'Value'],
80 | },
81 | }
82 |
--------------------------------------------------------------------------------
/scripts/pack.js:
--------------------------------------------------------------------------------
1 | /* eslint-disable no-console */
2 | import fs from 'fs'
3 | import { pack } from 'efrt'
4 | import { learn, compress } from 'suffix-thumb'
5 | import lexicon from '../data/lexicon/index.js'
6 | import models from '../data/models/index.js'
7 | // import switches from '../lib/switches/index.js'
8 | // import senses from '../lib/senses/index.js'
9 |
// Build the packed lexicon: group words by tag, then compress each word list.
const packLexicon = function () {
  const byTag = {}
  //turn them into a series of flat-arrays
  for (const [word, value] of Object.entries(lexicon)) {
    // a word may carry one tag (string) or several (array)
    const tags = typeof value === 'string' ? [value] : value
    tags.forEach(tag => {
      byTag[tag] = byTag[tag] || []
      byTag[tag].push(word)
    })
  }
  //pack each array into a tiny string
  for (const tag of Object.keys(byTag)) {
    byTag[tag] = pack(byTag[tag])
  }
  return byTag
}

// Build the packed conjugation models: learn suffix rules from each list of pairs, then compress them.
const packModels = function () {
  const packed = {}
  for (const [name, forms] of Object.entries(models)) {
    packed[name] = {}
    for (const [form, pairs] of Object.entries(forms)) {
      console.log(name, form)
      packed[name][form] = compress(learn(pairs))
    }
  }
  return packed
}

// One entry per generated file: a label for logging, the output path, and the builder.
const steps = [
  {
    label: 'lexicon',
    path: './src/01-one/lexicon/model/_data.js',
    compress: packLexicon,
  },
  {
    label: 'models',
    path: './src/01-one/lexicon/methods/_data.js',
    compress: packModels,
  },
]
52 |
// run through all our steps: build, write the module to ./src, report its size
for (const step of steps) {
  const { label, path } = step
  console.log(`\n 🕑 - packing ${label}..`)
  const packed = step.compress()

  //write it to a file in ./src
  const banner = `// generated in ./lib/${label}\n`
  const body = JSON.stringify(packed, null, 2)
  fs.writeFileSync(path, `${banner}export default ${body}`, 'utf8')

  //get filesize, in thousands of bytes
  const kilobytes = (fs.statSync(path).size / 1000.0).toFixed(1)
  console.log(` - ${label} is ${kilobytes}k\n`)
}
67 |
--------------------------------------------------------------------------------
/src/01-one/lexicon/model/misc.js:
--------------------------------------------------------------------------------
// Hand-written lexicon entries: forms of 'être' (the copula) and number words.
// Several forms appear twice, with and without accents; both spellings of a
// form must carry the same tags.
export default {
  // copulas (incomplete)
  es: ['Copula', 'PresentTense'],
  est: ['Copula', 'PresentTense'],
  suis: ['Copula', 'PresentTense'],
  sommes: ['Copula', 'PresentTense'],
  etes: ['Copula', 'PresentTense'],
  sont: ['Copula', 'PresentTense'],

  ete: ['Copula', 'PastTense'],
  etais: ['Copula', 'PastTense'],
  etions: ['Copula', 'PastTense'],

  serons: ['Copula', 'FutureTense'],
  seront: ['Copula', 'FutureTense'],
  serai: ['Copula', 'FutureTense'],

  cent: ['Multiple', 'Cardinal'],
  mille: ['Multiple', 'Cardinal'],
  million: ['Multiple', 'Cardinal'],
  milliard: ['Multiple', 'Cardinal'],
  quadrillion: ['Multiple', 'Cardinal'],
  centième: ['Multiple', 'Ordinal'],
  millième: ['Multiple', 'Ordinal'],
  millionième: ['Multiple', 'Ordinal'],
  milliardième: ['Multiple', 'Ordinal'],
  billionième: ['Multiple', 'Ordinal'],
  trillionième: ['Multiple', 'Ordinal'],
  // plural numbers
  septs: ['TextValue', 'Cardinal'],

  cents: ['Multiple', 'Cardinal'],
  milles: ['Multiple', 'Cardinal'],
  millions: ['Multiple', 'Cardinal'],
  milliards: ['Multiple', 'Cardinal'],

  êtes: ['Copula', 'PresentTense'],
  // imperfect 'nous étions': tagged like its unaccented twin 'etions'
  étions: ['Copula', 'PastTense'],
  // future 'vous serez': tagged like 'serons' / 'seront' / 'serai'
  serez: ['Copula', 'FutureTense'],
  // past participle: tagged like its unaccented twin 'ete'
  été: ['Copula', 'PastTense'],
  fus: ['Copula', 'PastTense'],
  fut: ['Copula', 'PastTense'],
  fûmes: ['Copula', 'PastTense'],
  fûtes: ['Copula', 'PastTense'],
  furent: ['Copula', 'PastTense'],
  fusse: ['Copula', 'PastTense'],
  fusses: ['Copula', 'PastTense'],
  fût: ['Copula', 'PastTense'],
  fussions: ['Copula', 'PastTense'],
  fussiez: ['Copula', 'PastTense'],
  fussent: ['Copula', 'PastTense'],
  // NOTE(review): the conditional, subjunctive/imperative and infinitive forms
  // below are all filed under PresentTense — confirm this is intended
  serais: ['Copula', 'PresentTense'],
  serait: ['Copula', 'PresentTense'],
  serions: ['Copula', 'PresentTense'],
  seriez: ['Copula', 'PresentTense'],
  seraient: ['Copula', 'PresentTense'],
  sois: ['Copula', 'PresentTense'],
  soyons: ['Copula', 'PresentTense'],
  soyez: ['Copula', 'PresentTense'],
  être: ['Copula', 'PresentTense'],
}
--------------------------------------------------------------------------------
/src/03-three/numbers/format/toText.js:
--------------------------------------------------------------------------------
1 | import data from '../data.js'
// Work on reversed *copies*: Array.prototype.reverse() reorders in place, which
// would silently flip the arrays owned by data.js for every other importer.
// twoDigit() below takes the first entry whose value fits, so it needs the
// entries in descending order (assumes data.js lists them ascending — TODO confirm).
let ones = data.ones.slice().reverse()
let tens = data.tens.slice().reverse()
4 |
// magnitudes, largest first; 'one' is a placeholder unit for whatever is left below 100
const multiples = [
  [1e12, 'mille milliard'],
  [1e11, 'cent milliard'],
  [1e9, 'milliard'],
  [1e8, 'cent million'],
  [1e6, 'million'],
  [100000, 'cent mille'],
  [1000, 'mille'],
  [100, 'cent'],
  [1, 'one'],
]

/**
 * Split a number into counts per magnitude, largest magnitude first.
 * e.g. 2500 -> [{ unit: 'mille', num: 2 }, { unit: 'cent', num: 5 }]
 * @param {number} num - the number to split
 * @returns {{unit: string, num: number}[]} only magnitudes with a non-zero count
 */
const getMagnitudes = function (num) {
  let remaining = num
  const found = []
  for (const [size, unit] of multiples) {
    // magnitudes larger than the input can never apply
    if (num >= size) {
      const count = Math.floor(remaining / size)
      remaining -= count * size
      if (count) {
        found.push({ unit, num: count })
      }
    }
  }
  return found
}
35 |
/**
 * Words for a number below 100: at most one entry from `tens`, then at most one from `ones`.
 * @param {number} num
 * @returns {string[]} the words, in spoken order
 */
const twoDigit = function (num) {
  const words = []
  // 20-90: first (largest) tens entry that fits
  const ten = tens.find(pair => pair[0] <= num)
  if (ten) {
    words.push(ten[1])
    num -= ten[0]
  }
  if (num === 0) {
    return words
  }
  // 0-19: first (largest) ones entry that fits the remainder
  const one = ones.find(pair => pair[0] <= num)
  if (one) {
    // 'et un' - a trailing 'un' after a tens word is joined with 'et'
    if (words.length && one[1] === 'un') {
      words.push('et')
    }
    words.push(one[1])
  }
  return words
}
63 |
/**
 * Turn a number into an array of French number-words.
 * @param {number} num
 * @returns {string[]} the words in order; negative numbers start with 'moins'
 */
const toText = function (num) {
  if (num === 0) {
    return ['zero']
  }
  let words = []
  if (num < 0) {
    words = ['moins']
    num = Math.abs(num)
  }
  // handle multiples
  for (const { unit, num: count } of getMagnitudes(num)) {
    const spoken = twoDigit(count)
    const isRemainder = unit === 'one'
    // a single multiple is said bare: 'cent', not a redundant 'un cent'
    if (isRemainder || count !== 1) {
      words = words.concat(spoken)
    }
    // the 'one' placeholder unit is never spoken
    if (!isRemainder) {
      words.push(unit)
    }
  }
  return words
}
export default toText
--------------------------------------------------------------------------------
/src/01-one/lexicon/compute/root.js:
--------------------------------------------------------------------------------
1 | const verbForm = function (term) {
2 | let want = [
3 | 'FirstPerson',
4 | 'SecondPerson',
5 | 'ThirdPerson',
6 | 'FirstPersonPlural',
7 | 'SecondPersonPlural',
8 | 'ThirdPersonPlural',
9 | ]
10 | return want.find(tag => term.tags.has(tag))
11 | }
12 |
13 | const root = function (view) {
14 | const transform = view.world.methods.two.transform
15 | view.docs.forEach(terms => {
16 | terms.forEach(term => {
17 | let str = term.implicit || term.normal || term.text
18 | // nouns -> singular masculine form
19 | if (term.tags.has('Noun') && !term.tags.has('Pronoun')) {
20 | let isPlural = term.tags.has('PluralNoun')
21 | // let isFemale = term.tags.has('FemaleNoun')
22 | if (isPlural) {
23 | term.root = transform.noun.fromPlural(str)
24 | }
25 | }
26 | // adjectives -> singular masculine form
27 | if (term.tags.has('Adjective')) {
28 | let isPlural = term.tags.has('PluralAdjective')
29 | let isFemale = term.tags.has('FemaleAdjective')
30 | if (isPlural && isFemale) {
31 | term.root = transform.adjective.fromFemalePlural(str)
32 | } else if (isFemale) {
33 | term.root = transform.adjective.fromFemale(str)
34 | } else if (isPlural) {
35 | term.root = transform.adjective.fromPlural(str)
36 | }
37 | }
38 | // verbs -> infinitive form
39 | if (term.tags.has('Verb')) {
40 | if (term.tags.has('PresentTense')) {
41 | let form = verbForm(term)
42 | term.root = transform.verb.fromPresentTense(str, form)
43 | }
44 | if (term.tags.has('FutureTense')) {
45 | let form = verbForm(term)
46 | term.root = transform.verb.fromFutureTense(str, form)
47 | }
48 | if (term.tags.has('Passive')) {
49 | let form = verbForm(term)
50 | term.root = transform.verb.fromPassive(str, form)
51 | } else if (term.tags.has('PastTense')) {
52 | let form = verbForm(term)
53 | term.root = transform.verb.fromPastParticiple(str, form)
54 | }
55 | // fromImperfectTense, fromPastParticiple
56 | }
57 | })
58 | })
59 | }
60 | export default root
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/index.js:
--------------------------------------------------------------------------------
1 | // 1st pass
2 | import checkRegex from './1st-pass/regex.js'
3 | import titleCase from './1st-pass/titlecase.js'
4 | import checkYear from './1st-pass/year.js'
5 | // 2nd pass
6 | import acronym from './2nd-pass/acronym.js'
7 | import neighbours from './2nd-pass/neighbours.js'
8 | import nounFallback from './2nd-pass/noun-fallback.js'
9 | import suffixCheck from './2nd-pass/suffix-lookup.js'
10 | // 3rd pass
11 | import nounGender from './3rd-pass/noun-gender.js'
12 | import nounPlurals from './3rd-pass/noun-plurals.js'
13 | import adjPlurals from './3rd-pass/adj-plurals.js'
14 | import adjGender from './3rd-pass/adj-gender.js'
15 | import verbTense from './3rd-pass/verb-tense.js'
16 | import verbForm from './3rd-pass/verb-form.js'
17 | import numberTypes from './3rd-pass/number-types.js'
18 | import fixContractions from './3rd-pass/fix-contractions.js'
19 |
// 1st pass: per-term checks that don't care about word-neighbours
const firstPass = function (terms, world) {
  let i = 0
  while (i < terms.length) {
    // is it titlecased? only try the look-like regex rules when that finds nothing
    if (!titleCase(terms, i, world)) {
      checkRegex(terms, i, world)
    }
    // turn '1993' into a year (always runs)
    checkYear(terms, i, world)
    i += 1
  }
}
// 2nd pass: ordered fallbacks - for each term, stop at the first step that returns something truthy
const secondPass = function (terms, world) {
  const fallbacks = [acronym, suffixCheck, neighbours, nounFallback]
  for (let i = 0; i < terms.length; i += 1) {
    fallbacks.some(step => step(terms, i, world))
  }
}
// 3rd pass: every step runs on every term, in this order
const thirdPass = function (terms, world) {
  const refinements = [nounGender, nounPlurals, adjPlurals, adjGender, verbTense, verbForm, numberTypes]
  for (let i = 0; i < terms.length; i += 1) {
    refinements.forEach(step => step(terms, i, world))
  }
  // (4th pass) contractions are fixed only once all terms have been through the steps above
  for (let i = 0; i < terms.length; i += 1) {
    fixContractions(terms, i, world)
  }
}
54 |
55 |
/**
 * Run the three tagging passes over every sentence of the view, in order.
 * @param {object} view - needs `view.world` and `view.docs`
 * @returns {object} the same view, after its terms were tagged in place
 */
const tagger = function (view) {
  const { world, docs } = view
  for (const terms of docs) {
    firstPass(terms, world)
    secondPass(terms, world)
    thirdPass(terms, world)
  }
  return view
}
export default tagger
--------------------------------------------------------------------------------
/types/misc.ts:
--------------------------------------------------------------------------------
/** a document is a list of sentences, each a list of terms */
export type Document = Term[][]

export type Pointer = [n?: number, start?: number, end?: number, startId?: string, endId?: string]

export type outMethods = 'text' | 'normal' | 'offset' | 'terms' | 'topk' | 'json' | 'tags' | 'array' | 'debug'

export type Groups = object

export interface Term {
  text: string,
  pre: string,
  post: string,
  normal: string,

  // in /two
  // `Set` takes a required type argument; tags are looked up by name (e.g. 'Noun')
  tags?: Set<string>,
  index?: [n?: number, start?: number],
  id?: string,
  chunk?: string,
  dirty?: boolean

  // other things you may find...
  syllables?: string[],
}

// possible values to .json()
export interface JsonProps {
  /** a perfect copy of the input text */
  text?: boolean
  /** normalized whitespace, case, unicode, punctuation */
  normal?: boolean
  /** lowercase, trimmed, contractions expanded. */
  reduced?: boolean
  /** cleanup whitespace */
  trim?: boolean
  /** character-position where this begins */
  offset?: boolean
  /** frequency of this match in the document */
  count?: boolean
  /** remove duplicate results*/
  unique?: boolean
  /** starting term # in document */
  index?: boolean
  /** options for each term */
  terms?: {
    text?: boolean
    normal?: boolean
    clean?: boolean
    implicit?: boolean
    tags?: boolean
    whitespace?: boolean
    id?: boolean
    offset?: boolean
    bestTag?: boolean
  }
}

// a key-value object of words, terms
export interface Lexicon {
  [key: string]: string
}

export interface Plugin {
  methods?: object,
  model?: object,
  compute?: object,
  hooks?: string[],
  tags?: object,
  words?: object,
  lib?: () => object,
  api?: (fn: (view: any) => {}) => void, //should be View
  mutate?: (fn: (world: object) => {}) => void,
}

export interface matchOptions {
  fuzzy?: number,
  caseSensitive?: boolean,
}

export interface Match {
  match: string,
  tag?: string | string[],
  unTag?: string | string[],
  group?: string | number,
  reason?: string,
}

export interface Net {
  hooks: object,
  always?: any,
  isNet: boolean
}
--------------------------------------------------------------------------------
/src/02-two/tagset/tags/nouns.js:
--------------------------------------------------------------------------------
1 | const entity = ['Person', 'Place', 'Organization'] // shared exclusion list, reused by Pronoun, Actor, Unit and Demonym below
2 |
3 | export default { // noun tag definitions; NOTE(review): `is` presumably names the parent tag, `also` extra tags to apply, `not` the tags that cannot co-exist — the code that reads these is outside this file
4 | Noun: {
5 | not: ['Verb', 'Adjective', 'Adverb', 'Value', 'Determiner'],
6 | },
7 | Singular: {
8 | is: 'Noun',
9 | not: ['PluralNoun'],
10 | },
11 | ProperNoun: {
12 | is: 'Noun',
13 | },
14 | Person: {
15 | is: 'Singular',
16 | also: ['ProperNoun'],
17 | not: ['Place', 'Organization', 'Date'],
18 | },
19 | FirstName: {
20 | is: 'Person',
21 | },
22 | MaleName: {
23 | is: 'FirstName',
24 | not: ['FemaleName', 'LastName'],
25 | },
26 | FemaleName: {
27 | is: 'FirstName',
28 | not: ['MaleName', 'LastName'],
29 | },
30 | LastName: {
31 | is: 'Person',
32 | not: ['FirstName'],
33 | },
34 | Honorific: {
35 | is: 'Noun',
36 | not: ['FirstName', 'LastName', 'Value'],
37 | },
38 | Place: {
39 | is: 'Singular',
40 | not: ['Person', 'Organization'],
41 | },
42 | Country: {
43 | is: 'Place',
44 | also: ['ProperNoun'],
45 | not: ['City'],
46 | },
47 | City: {
48 | is: 'Place',
49 | also: ['ProperNoun'],
50 | not: ['Country'],
51 | },
52 | Region: {
53 | is: 'Place',
54 | also: ['ProperNoun'],
55 | },
56 | Address: {
57 | // is: 'Place',
58 | },
59 | Organization: {
60 | is: 'ProperNoun',
61 | not: ['Person', 'Place'],
62 | },
63 | SportsTeam: {
64 | is: 'Organization',
65 | },
66 | School: {
67 | is: 'Organization',
68 | },
69 | Company: {
70 | is: 'Organization',
71 | },
72 | PluralNoun: {
73 | is: 'Noun',
74 | not: ['Singular'],
75 | },
76 | Uncountable: {
77 | is: 'Noun',
78 | },
79 | Pronoun: {
80 | is: 'Noun',
81 | not: entity,
82 | },
83 | Actor: {
84 | is: 'Noun',
85 | not: entity,
86 | },
87 | Activity: {
88 | is: 'Noun',
89 | not: ['Person', 'Place'],
90 | },
91 | Unit: {
92 | is: 'Noun',
93 | not: entity,
94 | },
95 | Demonym: {
96 | is: 'Noun',
97 | also: ['ProperNoun'],
98 | not: entity,
99 | },
100 | Possessive: {
101 | is: 'Noun',
102 | },
103 | // grammatical genders (mutually exclusive)
104 | MaleNoun: {
105 | is: 'Noun',
106 | not: ['FemaleNoun'],
107 | },
108 | FemaleNoun: {
109 | is: 'Noun',
110 | not: ['MaleNoun'],
111 | },
112 | }
113 |
--------------------------------------------------------------------------------
/data/lexicon/misc/adverbs.js:
--------------------------------------------------------------------------------
1 | // adverbs that cannot be found by suffix; all '-ment' words are tagged by the suffix table instead
2 | export default [
3 | 'pas',
4 | // 'plus',
5 | 'ainsi',
6 | 'lors',
7 | 'alors',
8 | 'aussi',
9 | 'donc',
10 | 'tres', // most entries are spelled without accents; a few (like the next one) are also listed accented
11 | 'très',
12 | 'deja',
13 | 'encore',
14 | // 'tout',
15 | 'bien',
16 | // 'moins',
17 | 'non',
18 | // 'hier',
19 | "jusqu'",
20 | 'meme',
21 | // 'peu',
22 | 'toujours',
23 | 'cependant',
24 | 'ailleurs',
25 | 'toutefois',
26 | // 'ici',
27 | 'environ',
28 | 'quant',
29 | 'que',
30 | 'tandis',
31 | 'beaucoup',
32 | 'outre',
33 | 'qu',
34 | 'ensuite',
35 | 'tant',
36 | 'jamais',
37 | 'enfin',
38 | 'tard',
39 | 'desormais',
40 | // 'maintenant',
41 | 'trop',
42 | 'autant',
43 | 'loin',
44 | 'pourtant',
45 | 'surtout',
46 | 'autour',
47 | 'auparavant',
48 | 'neanmoins',
49 | 'assez',
50 | 'tot',
51 | 'mieux',
52 | 'souvent',
53 | 'plutot',
54 | 'demain',
55 | 'pres',
56 | 'longtemps',
57 | 'presque',
58 | 'peut-etre',
59 | // 'mal',
60 | 'avant',
61 | 'partout',
62 | 'davantage',
63 | 'juste',
64 | 'vite',
65 | 'puis',
66 | 'parfois',
67 | 'guere',
68 | 'au dela',
69 | 'oui',
70 | 'au dessus',
71 | 'ores',
72 | // 'dehors',
73 | 'si',
74 | 'ci',
75 | 'bientot',
76 | // 'ensemble',
77 | 'apres',
78 | 'depuis',
79 | 'quand',
80 | 'quelque',
81 | 'aussitôt',
82 | 'quasi',
83 | // 'fort',
84 | 'vis a vis',
85 | 'dessous',
86 | 'voire',
87 | 'certes',
88 | 'jusque la',
89 | 'ci dessus',
90 | // 'matin',
91 | 'ci dessous',
92 | 'contre',
93 | 'autrefois',
94 | 'combien',
95 | 'comme',
96 | 'sous',
97 | 'inter',
98 | 'la bas',
99 | 'dorenavant',
100 | 'dessus',
101 | 'sans',
102 | 'alias',
103 | 'bel',
104 | 'jadis',
105 | // 'rien',
106 | 'etc',
107 | 'soit',
108 | 'entre temps',
109 | 'avant hier',
110 | "presqu'",
111 | // 'point',
112 | 'la dessus',
113 | 'mais',
114 | 'debout',
115 | 'ultra',
116 | 'bref',
117 | 'naguere',
118 | 'la-dedans',
119 | 'deca',
120 | 'ca',
121 | 'soi-disant',
122 | 'devant',
123 | 'fi',
124 | 'dedans',
125 | 'deja', // duplicate: 'deja' is already listed near the top of this array
126 | 'idem',
127 | 'sic',
128 | 'sitot',
129 | 'derriere',
130 | 'haut',
131 | 'outre mer',
132 | 'crescendo',
133 | 'pourquoi',
134 | 'primo',
135 | 'secundo',
136 | 'tertio',
137 | 'quelqu',
138 | 'ferme',
139 | 'au-dessous',
140 | 'pele mele',
141 | 'sident', // NOTE(review): not a French word I can identify — possibly a truncated entry; verify against the source word-list
142 | ]
143 |
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/2nd-pass/acronym.js:
--------------------------------------------------------------------------------
const oneLetterAcronym = /^[A-Z]('s|,)?$/
const isUpperCase = /^[A-Z-]+$/
const noPeriodAcronym = /[A-Z]{2,}('s|,)?$/

// capital letters that are ordinary words on their own
const oneLetterWord = {
  I: true,
  A: true,
}

// own-property check that also works on prototype-less lookup objects
const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key)

/**
 * just uppercase acronyms, no periods - 'UNOCHA'
 *
 * The text must be made of capitals (and dashes) only, so dotted or lower-case
 * forms like 'N.D.A' or 'c.e.o' are always rejected here. The regex checks for
 * those forms that used to follow the all-caps guard could never run and have
 * been removed, together with their regexes.
 *
 * @param {object} term - needs `text` and `normal`
 * @param {object} model - needs `model.one.lexicon`
 * @returns {boolean}
 */
const isNoPeriodAcronym = function (term, model) {
  const str = term.text
  // ensure it's all upper-case
  if (isUpperCase.test(str) === false) {
    return false
  }
  // long capitalized words are not usually either
  if (str.length > 5) {
    return false
  }
  // 'I' is not a acronym
  if (hasOwn(oneLetterWord, str)) {
    return false
  }
  // known-words, like 'PIZZA' is not an acronym.
  if (hasOwn(model.one.lexicon, term.normal)) {
    return false
  }
  // a single capital like 'F', or two-or-more capitals in a row like 'NDA'
  return oneLetterAcronym.test(str) || noPeriodAcronym.test(str)
}

/**
 * Tag terms[i] as an Acronym when it looks like one.
 * @param {object[]} terms - terms of one sentence
 * @param {number} i - index of the term to inspect
 * @param {object} world - needs `world.model` and `world.methods.one.setTag`
 * @returns {true|null} true when a tag was set, null otherwise
 */
const isAcronym = function (terms, i, world) {
  const setTag = world.methods.one.setTag
  const term = terms[i]
  //these are not acronyms (or are already tagged as one)
  if (term.tags.has('RomanNumeral') || term.tags.has('Acronym')) {
    return null
  }
  //non-period ones are harder
  if (isNoPeriodAcronym(term, world.model)) {
    // drop earlier guesses before tagging
    term.tags.clear()
    setTag([term], ['Acronym', 'Noun'], world, false, '3-no-period-acronym')
    return true
  }
  // one-letter acronyms, including forms like "F," that the check above rejects
  if (!hasOwn(oneLetterWord, term.text) && oneLetterAcronym.test(term.text)) {
    term.tags.clear()
    setTag([term], ['Acronym', 'Noun'], world, false, '3-one-letter-acronym')
    return true
  }
  //if it's a very-short organization?
  if (term.tags.has('Organization') && term.text.length <= 3) {
    setTag([term], 'Acronym', world, false, '3-org-acronym')
    return true
  }
  // upper-case org, like UNESCO
  if (term.tags.has('Organization') && isUpperCase.test(term.text) && term.text.length <= 6) {
    setTag([term], 'Acronym', world, false, '3-titlecase-acronym')
    return true
  }
  return null
}
export default isAcronym
81 |
--------------------------------------------------------------------------------
/src/02-two/preTagger/model/regex/regex-numbers.js:
--------------------------------------------------------------------------------
1 | export default [ // each rule is [pattern, tag or tags, label]; NOTE(review): the third element looks like an example/reason string and is omitted on one rule — confirm how the consumer uses it
2 |
3 | [/^[012]?[0-9]h$/i, 'Time', '04h'],
4 | [/^[012]?[0-9]h[0-9]{2}$/i, 'Time', '23h30'],
5 | [/^'[0-9]{2}$/, 'Year'],
6 | // times
7 | [/^[012]?[0-9](:[0-5][0-9])(:[0-5][0-9])$/, 'Time', '3:12:31'],
8 | [/^[012]?[0-9](:[0-5][0-9])?(:[0-5][0-9])$/, 'Time', '1:12'],
9 |
10 | // iso-dates
11 | [/^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}/i, 'Date', 'iso-date'],
12 | [/^[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,4}$/, 'Date', 'iso-dash'],
13 | [/^[0-9]{1,4}\/[0-9]{1,2}\/[0-9]{1,4}$/, 'Date', 'iso-slash'],
14 | [/^[0-9]{1,4}\.[0-9]{1,2}\.[0-9]{1,4}$/, 'Date', 'iso-dot'],
15 | [/^[0-9]{1,4}-[a-z]{2,9}-[0-9]{1,4}$/i, 'Date', '12-dec-2019'],
16 |
17 | // timezones
18 | [/^utc ?[+-]?[0-9]+$/, 'Timezone', 'utc-9'],
19 | [/^(gmt|utc)[+-][0-9]{1,2}$/i, 'Timezone', 'gmt-3'],
20 |
21 | //phone numbers
22 | [/^[0-9]{3}-[0-9]{4}$/, 'PhoneNumber', '421-0029'],
23 | [/^(\+?[0-9][ -])?[0-9]{3}[ -]?[0-9]{3}-[0-9]{4}$/, 'PhoneNumber', '1-800-'],
24 |
25 |
26 | //money
27 | //like $5.30
28 | [
29 | /^[-+]?[$\xA2-\xA5\u058F\u060B\u09F2\u09F3\u09FB\u0AF1\u0BF9\u0E3F\u17DB\u20A0-\u20BD\uA838\uFDFC\uFE69\uFF04\uFFE0\uFFE1\uFFE5\uFFE6][-+]?[0-9]+(,[0-9]{3})*(\.[0-9]+)?([kmb]|bn)?\+?$/,
30 | ['Money', 'Value'],
31 | '$5.30',
32 | ],
33 | //like 5.30$
34 | [
35 | /^[-+]?[0-9]+(,[0-9]{3})*(\.[0-9]+)?[$\xA2-\xA5\u058F\u060B\u09F2\u09F3\u09FB\u0AF1\u0BF9\u0E3F\u17DB\u20A0-\u20BD\uA838\uFDFC\uFE69\uFF04\uFFE0\uFFE1\uFFE5\uFFE6]\+?$/,
36 | ['Money', 'Value'],
37 | '5.30£',
38 | ],
39 | //like $400usd
40 | [/^[-+]?[$£]?[0-9]([0-9,.])+(usd|eur|jpy|gbp|cad|aud|chf|cny|hkd|nzd|kr|rub)$/i, ['Money', 'Value'], '$400usd'],
41 |
42 | //numbers
43 | // 50 | -50 | 3.23 | 5,999.0 | 10+
44 | [/^[-+]?[0-9]+(,[0-9]{3})*(\.[0-9]+)?\+?$/, ['Cardinal', 'NumericValue'], '5,999'],
45 | [/^[-+]?[0-9]+(,[0-9]{3})*(\.[0-9]+)?(e|er)$/, ['Ordinal', 'NumericValue'], '53rd'], // the pattern accepts an 'e' or 'er' ending (1er, 2e), not 'rd' as the label suggests
46 | // .73 (leading-dot decimal; the pattern accepts no letter ending)
47 | [/^\.[0-9]+\+?$/, ['Cardinal', 'NumericValue'], '.73th'],
48 | //percent
49 | [/^[-+]?[0-9]+(,[0-9]{3})*(\.[0-9]+)?%\+?$/, ['Percent', 'Cardinal', 'NumericValue'], '-4%'],
50 | [/^\.[0-9]+%$/, ['Percent', 'Cardinal', 'NumericValue'], '.3%'],
51 | //fraction
52 | [/^[0-9]{1,4}\/[0-9]{1,4}(e|er)?s?$/, ['Fraction', 'NumericValue'], '2/3rds'],
53 | //range
54 | [/^[0-9.]{1,3}[a-z]{0,2}[-–—][0-9]{1,3}[a-z]{0,2}$/, ['Value', 'NumberRange'], '3-4'],
55 | //time-range
56 | [/^[0-9]{1,2}(:[0-9][0-9])?(am|pm)? ?[-–—] ?[0-9]{1,2}(:[0-9][0-9])?(am|pm)$/, ['Time', 'NumberRange'], '3-4pm'],
57 | //with unit
58 | [/^[0-9.]+([a-z]{1,4})$/, 'Value', '9km'],
59 | ]
--------------------------------------------------------------------------------
/src/01-one/lexicon/model/lexicon.js:
--------------------------------------------------------------------------------
1 | import lexData from './_data.js'
2 | import { unpack } from 'efrt'
3 | import transform from '../methods/index.js'
4 | import misc from './misc.js'
5 |
// maps the keys returned by the verb-conjugation helpers to their grammatical-person tag
const tagMap = {
  first: 'FirstPerson',
  second: 'SecondPerson',
  third: 'ThirdPerson',
  firstPlural: 'FirstPersonPlural',
  secondPlural: 'SecondPersonPlural',
  thirdPlural: 'ThirdPersonPlural',
}

// word -> tag (or list of tags), built from the packed word-lists plus their derived forms
const words = {}

// Register a derived form, unless the transform produced nothing usable
// or the word has already been claimed by an earlier entry.
// The own-property check keeps inherited names (like 'constructor') from looking "taken".
const addIfMissing = function (str, tag) {
  if (!str || Object.prototype.hasOwnProperty.call(words, str)) {
    return
  }
  words[str] = tag
}

Object.keys(lexData).forEach(tag => {
  const wordsObj = unpack(lexData[tag])
  Object.keys(wordsObj).forEach(w => {
    words[w] = tag

    // expand adjectives into their gendered/plural forms
    if (tag === 'MaleAdjective') {
      const res = transform.adjective.conjugate(w)
      addIfMissing(res.female, 'FemaleAdjective')
      addIfMissing(res.plural, 'MaleAdjective')
      addIfMissing(res.femalePlural, 'FemaleAdjective')
    }
    if (tag === 'Cardinal') {
      words[w] = ['TextValue', 'Cardinal']
    }
    // nouns are stored as singular, and their plural is added as a generic plural noun
    if (tag === 'Noun' || tag === 'MaleNoun' || tag === 'FemaleNoun') {
      words[w] = [tag, 'Singular']
      addIfMissing(transform.noun.toPlural(w), ['Noun', 'Plural'])
    }
    if (tag === 'Ordinal') {
      words[w] = ['TextValue', 'Ordinal']
      // also accept the spelling without the grave accent
      addIfMissing(w.replace(/è/, 'e'), ['TextValue', 'Ordinal'])
    }
    if (tag === 'Infinitive') {
      // future-tense forms
      let res = transform.verb.toFutureTense(w)
      Object.keys(res).forEach(k => addIfMissing(res[k], [tagMap[k], 'FutureTense']))
      // present-tense forms
      res = transform.verb.toPresentTense(w)
      Object.keys(res).forEach(k => addIfMissing(res[k], [tagMap[k], 'PresentTense']))
      // imperfect forms only get the generic verb tag
      res = transform.verb.toImperfect(w)
      Object.keys(res).forEach(k => addIfMissing(res[k], 'Verb'))
      // past-participle
      addIfMissing(transform.verb.toPastParticiple(w), 'PastParticiple')
    }
  })
})

// entries from the hand-written misc list win over the generated ones
let lexicon = Object.assign({}, words, misc)
// console.log(Object.keys(lexicon).length.toLocaleString(), 'words')
// console.log(lexicon['ralentir'])
export default lexicon
--------------------------------------------------------------------------------
/tests/numbers/ordinal.test.js:
--------------------------------------------------------------------------------
1 | import test from 'tape'
2 | import nlp from '../_lib.js'
3 | let here = '[number ordinal] '
4 |
5 |
// [number, cardinal text, ordinal text]
// the number column is informational only: the tests below read the two text columns
let arr = [
  [0, 'zero', 'zeroième'],
  // [1, 'un', 'unième'],
  [2, 'deux', 'deuxième'],
  [3, 'trois', 'troisième'],
  [4, 'quatre', 'quatrième'],
  [5, 'cinq', 'cinquième'],
  [6, 'six', 'sixième'],
  [7, 'sept', 'septième'],
  [8, 'huit', 'huitième'],
  [9, 'neuf', 'neuvième'],

  [10, 'dix', 'dixième'],
  [11, 'onze', 'onzième'],
  [12, 'douze', 'douzième'],
  [13, 'treize', 'treizième'],
  [14, 'quatorze', 'quatorzième'],
  [15, 'quinze', 'quinzième'],
  [16, 'seize', 'seizième'],
  [17, 'dix sept', 'dix septième'],
  [18, 'dix huit', 'dix huitième'],
  [19, 'dix neuf', 'dix neuvième'],

  [20, 'vingt', 'vingtième'],
  [30, 'trente', 'trentième'],
  [40, 'quarante', 'quarantième'],
  [50, 'cinquante', 'cinquantième'],
  [60, 'soixante', 'soixantième'],
  [70, 'soixante dix', 'soixante dixième'],
  [80, 'quatre vingt', 'quatre vingtième'],
  // 'quatre vingt dix huit' is 98 (it was labelled 90)
  [98, 'quatre vingt dix huit', 'quatre vingt dix huitième'],

  [100, 'cent', 'centième'],
  [1000, 'mille', 'millième'],
  [1000000, 'million', 'millionième'],//million 1,000,000
  [1000000000, 'milliard', 'milliardième'],//billion 1,000,000,000
  // [1000000000000, 'mille milliards', 'mille milliardième'],//trillion 1,000,000,000,000

]
// every cardinal phrase should be rewritten into its ordinal counterpart
test('cardinal to ordinal:', function (t) {
  for (const [, card, ord] of arr) {
    const converted = nlp(card).numbers().toOrdinal()
    t.equal(converted.text(), ord, here + ' [toOrdinal] ' + card)
  }
  t.end()
})
// every ordinal phrase should be rewritten back into its cardinal counterpart
test('ordinal -> cardinal:', function (t) {
  for (const [, card, ord] of arr) {
    const converted = nlp(ord).numbers().toCardinal()
    t.equal(converted.text(), card, here + ' [toCardinal] ' + card)
  }
  t.end()
})
61 |
62 |
// a written-out ordinal should become its numeric short-form
test('ordinal fmt:', function (t) {
  // [number, written ordinal, expected numeric form]
  const cases = [
    // [1, 'première', '1er'],//'first'
    [2, 'deuxième', '2e'],//'second'
    [3, 'troisième', '3e'],//'third'
    [4, 'quatrième', '4e'],//'fourth'
    [5, 'cinquième', '5e'],//'fifth'
    [6, 'sixième', '6e'],//'sixth'
    [7, 'septième', '7e'],//'seventh'
    [8, 'huitième', '8e'],//'eighth'
    [9, 'neuvième', '9e'],//'ninth'
    [10, 'dixième', '10e'],//'tenth'
  ]
  for (const [, str, want] of cases) {
    const numeric = nlp(str).numbers().toNumber()
    t.equal(numeric.text(), want, here + str)
  }
  t.end()
})
83 |
--------------------------------------------------------------------------------
/tests/numbers/number-misc.test.js:
--------------------------------------------------------------------------------
1 | import test from 'tape'
2 | import nlp from '../_lib.js'
3 | let here = '[number-misc] '
4 |
5 |
// both phrases of each pair should parse to the same number
test('num equals', function (t) {
  const pairs = [
    ['un cent', 'cent'],
    ['trois cents', 'trois cent'],
    ['un million', 'million'],
    ['3 cent', 'trois cent'],
    ['cinquante', 'cinquantième'],
    ['sept', 'septième'],
    ['dix huit', 'dix huitième'],
    ['moins dix huitième', '-18e'],
    ['moins dix huit', '-18'],
    ['moins deux centième', '-200'],
    ['quatorze cent', 'quatorze centième']
  ]
  // first parsed number found in a phrase
  const firstNumber = str => nlp(str).numbers().get()[0]
  pairs.forEach(pair => {
    const left = firstNumber(pair[0])
    const right = firstNumber(pair[1])
    t.equal(left, right, here + pair.join(' == '))
  })
  t.end()
})
28 |
// arithmetic should keep the currency prefix, unit suffix or ordinal suffix in place
test('prefix/suffix:', function (t) {
  // [input, method, expected text, label]
  const cases = [
    ['$7,938', 'add', '$7939', 'add money'],
    ['7,938kg', 'minus', '7937kg', 'minus w/ unit'],
    ['938.4cm', 'minus', '937.4cm', 'minus w/ decimal'],
    ['33e', 'add', '34e', 'add ordinal'],
  ]
  cases.forEach(([input, method, want, label]) => {
    const doc = nlp(input).numbers()[method](1)
    t.equal(doc.text(), want, here + label)
  })
  t.end()
})
43 |
44 | // test('units-basic:', function (t) {
45 | // let arr = [
46 | // // ['33km', 'km'],
47 | // ['33 km', 'km'],
48 | // ['40,000 mètres', 'mètres'],
49 | // ['1 pouce', 'pouce'],
50 | // ['2 pouces', 'pouces'],
51 | // ['seven hundred litres', 'litres'],
52 | // ['one litre', 'litre'],
53 | // ['0.4 mètre', 'meter'],
54 | // // ['3 km2', 'km2'],
55 | // ['3 km²', 'km²'],
56 | // // ['44 °c', '°c'],
57 | // ]
58 | // arr.forEach(a => {
59 | // let m = nlp(a[0]).numbers().units()
60 | // t.equal(m.out('normal'), a[1], here + a[0])
61 | // })
62 | // t.end()
63 | // })
64 |
65 |
// adding to a written number rewrites it in place, inside the sentence
test('plus:', function (t) {
  const positive = nlp(`j'ai quatre vingt deux pommes`)
  positive.numbers().add(2)
  t.equal(positive.text(), `j'ai quatre vingt quatre pommes`, here + 'plus-2')

  const negative = nlp(`j'ai moins quarante pommes`)
  negative.numbers().add(50)
  t.equal(negative.text(), `j'ai dix pommes`, here + 'plus-50')
  t.end()
})
76 |
// subtracting from a written number rewrites it in place, inside the sentence
test('minus:', function (t) {
  const positive = nlp(`j'ai quarante pommes`)
  positive.numbers().minus(50)
  t.equal(positive.text(), `j'ai moins dix pommes`, here + 'minus-50')

  const negative = nlp(`j'ai moins quarante pommes`)
  negative.numbers().minus(50)
  t.equal(negative.text(), `j'ai moins quatre vingt dix pommes`, here + 'minus-50')
  t.end()
})
--------------------------------------------------------------------------------
/src/02-two/preTagger/methods/guessGender.js:
--------------------------------------------------------------------------------
// determiners that mark the following noun as masculine / feminine
const masc = new Set(['le', 'un', 'du'])
const femme = new Set(['la', 'une'])

// word-endings used to guess a noun's gender
const femaleEnds = ['anse', 'ette', 'esse', 'ance', 'eine', 'ure', 'ion']
// each suffix is listed once (the previous list repeated many of them)
const maleEnds = [
  'age', 'isme', 'eau', 'ment', 'in', 'ou', 'et', 'ege', 'eme', 'ome', 'aume', 'an', 'ent', 'ai', 'out', 'eu', 'ut', 'is', 'il', 'ex',
  'and', 'ant', 'int', 'om', 'ond', 'ont', 'au', 'aud', 'aut', 'o', 'os', 'ot', 'ais', 'ait', 'es', 'oux', 'i', 'it', 'y', 'at', 'as', 'ois', 'oit', 'u', 'us',
  'er', 'cé', 'ème', 'est', 'al', 'el', 'ol', 'eul', 'all', 'if', 'ef', 'ac', 'ic', 'oc', 'uc', 'am', 'um', 'en', 'air',
  'erf', 'ert', 'ar', 'arc', 'ars', 'art', 'our', 'ours', 'or', 'ord', 'ors', 'ort', 'ir', 'oir', 'eur', 'ail', 'eil', 'euil', 'ueil', 'ing',
]
11 |
12 |
// guess a gender from the word's ending, ignoring one trailing 's'
// female endings are checked first; returns null when nothing matches
const suffixGuess = function (term) {
  const stem = term.normal.replace(/s$/, '')
  const endsWithAny = list => list.some(suff => stem.endsWith(suff))
  if (endsWithAny(femaleEnds)) {
    return 'FemaleNoun'
  }
  return endsWithAny(maleEnds) ? 'MaleNoun' : null
}
24 |
// last resort: a word ending in 'e' or 'es' is called feminine, anything else is left undecided
const fallback = function (term) {
  const word = term.normal
  const looksFemale = ['e', 'es'].some(end => word.endsWith(end))
  return looksFemale ? 'FemaleNoun' : null //-?
}
32 |
// look at the one or two preceding words for a gendered determiner
// stops (with null) as soon as it runs off the start of the term list
const lookLeft = function (terms, i) {
  for (const index of [i - 1, i - 2]) {
    const before = terms[index]
    if (!before) {
      return null
    }
    const { normal } = before
    if (masc.has(normal)) {
      return 'MaleNoun'
    }
    if (femme.has(normal)) {
      return 'FemaleNoun'
    }
  }
  return null
}
48 |
49 | // look for a gendered adjective
// look for a gendered adjective directly after the noun (one word only)
const lookRight = function (terms, i) {
  const after = terms[i + 1]
  if (!after) {
    return null
  }
  if (after.tags.has('MaleAdjective')) {
    return 'MaleNoun'
  }
  return after.tags.has('FemaleAdjective') ? 'FemaleNoun' : null
}
65 |
// decide the gender tag for the noun at terms[i]
// returns 'MaleNoun', 'FemaleNoun', or null when the term is not a noun or no clue is found
const guessGender = function (terms, i) {
  const term = terms[i]
  const has = tag => term.tags.has(tag)
  if (!has('Noun')) {
    return null
  }
  // an existing gender tag wins
  if (has('MaleNoun')) {
    return 'MaleNoun'
  }
  if (has('FemaleNoun')) {
    return 'FemaleNoun'
  }
  // otherwise try each clue in turn: determiner, adjective, suffix, then the 'e' fallback
  return lookLeft(terms, i) || lookRight(terms, i) || suffixGuess(term) || fallback(term)
}
83 | export default guessGender
--------------------------------------------------------------------------------
/plugins/dates/tests/backburner/equals.ignore.js:
--------------------------------------------------------------------------------
1 | import test from 'tape'
2 | import nlp from './_lib.js'
3 |
// reference date and timezone handed to every .dates() call in this file
const context = {
  today: '2020-01-21',
  timezone: 'Canada/Pacific',
}

// [input, equivalent]: the test parses both strings and compares their start dates,
// so the right-hand side may be an iso string or another phrase
const arr = [
  // explicit-dates
  [`march 2nd`, '2020-03-02T00:00:00.000-08:00'],
  [`2 march`, '2020-03-02T00:00:00.000-08:00'],
  [`tues march 2`, '2020-03-02T00:00:00.000-08:00'],
  [`march the second`, '2020-03-02T00:00:00.000-08:00'],
  [`on the 2nd of march`, '2020-03-02T00:00:00.000-08:00'],

  // numerical-dates
  [`1999/03/02`, 'march 2 1999'],
  [`1999-03-02`, 'march 2 1999'],
  [`03-02-1999`, 'march 2nd 1999'],
  [`03/02`, 'march 2'],
  [`2015.08.13`, 'aug 13 2015'],

  // named-dates
  [`today`, '2020-01-21'],
  [`now`, 'right now'],
  [`q1`, 'jan 1'],
  [`tomorrow`, '2020-01-22'],

  // times
  [`2pm`, '2020-01-21T14:00:00.000-08:00'],
  [`2:12pm`, '2020-01-21T14:12:00.000-08:00'],
  [`2pm eastern time`, '2020-01-21T14:00:00.000-05:00'],
  [`2:12 in the evening`, '2020-01-21T14:12:00.000-08:00'],
  [`02:12:00am`, '2020-01-21T02:12:00.000-08:00'],
  [`2 oclock am`, '2020-01-21T02:00:00.000-08:00'],
  [`noon`, 'today at 12pm'],
  [`at night`, 'today at 8:00pm'],
  // NOTE(review): 'in the morning' is paired with an 8:00pm time — confirm this is intended
  [`in the morning`, 'tomorrow at 8:00pm'],
  [`tomorrow evening`, 'Jan 22 6pm'],
  [`aug-20`, '20-aug'],
  // relative phrases
  [`in a few years`, `in 3 years`],
  [`in a couple years`, `in 2 years`],
  [`2 weeks back`, `2 weeks ago`],
  // quarters, relative to the 2020 reference date
  [`last q1`, `q1 2019`],
  [`last q2`, `q2 2019`],
  [`last q3`, `q3 2019`],
  [`last q4`, `q4 2019`],
  [`this q1`, `q1 2020`],
  [`this q2`, `q2 2020`],
  [`this q3`, `q3 2020`],
  [`this q4`, `q4 2020`],
  [`next q1`, `q1 2021`],
  [`next q2`, `q2 2021`],
  [`next q3`, `q3 2021`],
  [`next q4`, `q4 2021`],
  // bare hours: the expected am/pm choice is spelled out on the right
  [`tuesday at 3`, `tuesday 3:00pm`],
  [`tuesday at 4:00`, `tuesday 4:00pm`],
  [`5:30`, `today at 5:30pm`],
  [`tuesday at 3am`, `tuesday 3:00am`],
  [`5 oclock`, `today at 5:00pm`],
  [`5 oclock am`, `today at 5:00am`],
  [`10 oclock`, `today at 10:00am`],
  [`11:30`, `today at 11:30am`],
  [`11:30pm`, `today at 11:30pm`],
  [`tuesday at 1`, `tuesday at 1pm`],
  // lists of dates (only the first result of each side is compared)
  ['this fri, monday', 'fri jan 24 and mon jan 27'],
  ['next friday, this monday', 'fri jan 31 and mon jan 27'],
]
70 |
// both sides of each pair should resolve to the same start date
test('date-variety', function (t) {
  // start of the first date found in a string, or undefined when nothing is found
  const startOf = str => {
    const found = nlp(str).dates(context).json()[0] || {}
    const date = found.date || {}
    return date.start
  }
  arr.forEach((pair) => {
    const left = startOf(pair[0])
    const right = startOf(pair[1])
    t.equal(left, right, pair[0])
  })
  t.end()
})
81 |
--------------------------------------------------------------------------------
/src/02-two/tagset/tags/verbs.js:
--------------------------------------------------------------------------------
// Tag definitions for verbs.
// Each key is a tag name; `is` names a single parent tag and `not` lists tags it excludes.
// NOTE(review): presumably consumed by the tagset plugin to build the tag tree — confirm there.
export default {
  // root of the verb tags
  Verb: {
    not: ['Noun', 'Adjective', 'Adverb', 'Value', 'Expression'],
  },
  PresentTense: {
    is: 'Verb',
    not: ['PastTense'],
  },
  Infinitive: {
    is: 'PresentTense',
    not: ['Gerund'],
  },
  Imperative: {
    is: 'Infinitive',
  },
  Gerund: {
    is: 'PresentTense',
    not: ['Copula'],
  },
  PastTense: {
    is: 'Verb',
    not: ['PresentTense', 'Gerund'],
  },
  Copula: {
    is: 'Verb',
  },
  Modal: {
    is: 'Verb',
    not: ['Infinitive'],
  },
  PerfectTense: {
    is: 'Verb',
    not: ['Gerund'],
  },
  Pluperfect: {
    is: 'Verb',
  },
  Participle: {
    is: 'PastTense',
  },
  PhrasalVerb: {
    is: 'Verb',
  },
  Passive: {
    is: 'PastTense',
  },
  Particle: {
    is: 'PhrasalVerb',
    not: ['PastTense', 'PresentTense', 'Copula', 'Gerund'],
  },
  Auxiliary: {
    is: 'Verb',
    not: ['PastTense', 'PresentTense', 'Gerund', 'Conjunction'],
  },

  // french verb forms
  PresentParticiple: {
    is: 'PresentTense',
    not: ['PastTense', 'FutureTense'],
  },
  PastParticiple: {
    is: 'PastTense',
    not: ['PresentTense', 'FutureTense'],
  },
  // [only formal] parlai, parlâmes
  PastSimple: {
    is: 'PastTense',
    not: ['PresentTense', 'FutureTense'],
  },
  ConditionalVerb: {
    is: 'Verb',
  },
  FutureTense: {
    is: 'Verb',
    not: ['PresentTense', 'PastTense', 'Gerund'],
  },

  // grammatical person: each of the six tags excludes the other five
  FirstPerson: {
    is: 'Verb',
    not: ['SecondPerson', 'ThirdPerson', 'FirstPersonPlural', 'SecondPersonPlural', 'ThirdPersonPlural']
  },
  SecondPerson: {
    is: 'Verb',
    not: ['FirstPerson', 'ThirdPerson', 'FirstPersonPlural', 'SecondPersonPlural', 'ThirdPersonPlural']
  },
  ThirdPerson: {
    is: 'Verb',
    not: ['FirstPerson', 'SecondPerson', 'FirstPersonPlural', 'SecondPersonPlural', 'ThirdPersonPlural']
  },
  FirstPersonPlural: {
    is: 'Verb',
    not: ['FirstPerson', 'SecondPerson', 'ThirdPerson', 'SecondPersonPlural', 'ThirdPersonPlural']
  },
  SecondPersonPlural: {
    is: 'Verb',
    not: ['FirstPerson', 'SecondPerson', 'ThirdPerson', 'FirstPersonPlural', 'ThirdPersonPlural']
  },
  ThirdPersonPlural: {
    is: 'Verb',
    not: ['FirstPerson', 'SecondPerson', 'ThirdPerson', 'FirstPersonPlural', 'SecondPersonPlural']
  },
}
104 |
--------------------------------------------------------------------------------
/learn/giga/corpus.js:
--------------------------------------------------------------------------------
1 | import { forEachSync } from './_giga.js'
2 | import doSentences from './french.js'
3 | import fs from 'fs'
4 |
5 |
// zero-padded corpus ids, '0001' through '0010'
let ids = Array.from({ length: 10 }, (_, index) => String(index + 1).padStart(4, '0'))
// ids = ['0004']
12 |
// maps the corpus part-of-speech codes to this library's tag names
// codes left commented-out have no mapping, so lookups for them return undefined
let tagMap = {
  'ABR': 'Abbreviation',//abbreviation
  'ADJ': 'Adjective',//adjective
  'ADV': 'Adverb',//adverb
  'DET:ART': 'Determiner',//article
  'DET:POS': 'Pronoun',//possessive pronoun (ma, ta, ...)
  'INT': 'Interjection',//interjection
  'KON': 'Conjunction',//conjunction
  'NAM': 'ProperNoun',//proper name
  'NOM': 'Noun',//noun
  'NUM': 'Value',//numeral
  'PRO': 'Pronoun',//pronoun
  'PRO:DEM': 'Pronoun',//demonstrative pronoun
  'PRO:IND': 'Pronoun',//indefinite pronoun
  'PRO:PER': 'Pronoun',//personal pronoun
  'PRO:POS': 'Pronoun',//possessive pronoun (mien, tien, ...)
  'PRO:REL': 'Pronoun',//relative pronoun
  'PRP': 'Preposition',//preposition
  'PRP:det': 'Preposition',//preposition plus article (au,du,aux,des)
  // 'PUN':'',//punctuation
  // 'PUN:cit':'',//punctuation citation
  // 'SENT':'',//sentence tag
  // 'SYM':'',//symbol
  // every verb form collapses into the one 'Verb' tag
  'VER:cond': 'Verb',//verb conditional
  'VER:futu': 'Verb',//verb future
  'VER:impe': 'Verb',//verb imperative
  'VER:impf': 'Verb',//verb imperfect
  'VER:infi': 'Verb',//verb infinitive
  'VER:pper': 'Verb',//verb past participle
  'VER:ppre': 'Verb',//verb present participle
  'VER:pres': 'Verb',//verb present
  'VER:simp': 'Verb',//verb simple past
  'VER:subi': 'Verb',//verb subjunctive imperfect
  'VER:subp': 'Verb',//verb subjunctive present
}
48 |
// word counts, kept only for these four tags
let byTag = {
  Verb: {},
  Noun: {},
  Adjective: {},
  Adverb: {},
}
// tally every french term of a sentence-pair under its mapped tag
const doBoth = function (both) {
  both.fr.forEach(term => {
    const tag = tagMap[term['$'].pos]
    const word = term['$text'].toLowerCase()
    // skip unmapped codes, and tags that are not being counted
    if (!tag || !byTag[tag]) {
      return
    }
    const counts = byTag[tag]
    counts[word] = (counts[word] || 0) + 1
  })
}
// process one corpus file; a failure is logged, and does not stop the remaining files
const runOne = async id => {
  try {
    console.log(`\ndoing ${id}:\n`)
    await doSentences(id, doBoth)
  } catch (e) {
    console.log(e)
  }
}
await forEachSync(ids, runOne)
73 |
// write the words counted for a tag to ./<tag>.js, most frequent first
// only words seen more than `max` times are kept; returns the written list
const doTag = function (tag, max = 6) {
  const words = Object.entries(byTag[tag])
    .filter(entry => entry[1] > max)
    .sort((a, b) => b[1] - a[1])
    .map(entry => entry[0])
  fs.writeFileSync(`./${tag}.js`, 'export default ' + JSON.stringify(words, null, 2))
  return words
}
const outputTags = ['Adverb', 'Verb', 'Noun', 'Adjective']
outputTags.forEach(tag => doTag(tag))
// console.dir(byTag, { depth: 5 })
--------------------------------------------------------------------------------
/plugins/dates/tests/backburner/to-iso.ignore.js:
--------------------------------------------------------------------------------
1 | import test from 'tape'
2 | import nlp from './_lib.js'
3 |
// reference date and timezone handed to every .dates() call in this file
const context = {
  today: '2019-02-02T03:40:00.000Z',
  timezone: 'UTC',
}

// [input, expected iso start]
let arr = [
  // full dates (the first entry appears twice)
  ['june 5th 1999', '1999-06-05T00:00:00.000Z'],
  ['june 5th 1999', '1999-06-05T00:00:00.000Z'],
  ['january 1st 1644', '1644-01-01T00:00:00.000Z'],
  ['jan 1st 1644', '1644-01-01T00:00:00.000Z'],
  ['June 4th 1993', '1993-06-04T00:00:00.000Z'],
  ['March 1st 1987', '1987-03-01T00:00:00.000Z'],
  ['June 22nd 2014', '2014-06-22T00:00:00.000Z'],
  ['may 22nd 2014', '2014-05-22T00:00:00.000Z'],
  ['sep 22nd 2014', '2014-09-22T00:00:00.000Z'],
  ['apr 22nd 2014', '2014-04-22T00:00:00.000Z'],
  ['June 22nd 1997', '1997-06-22T00:00:00.000Z'],
  ['january 5th 1998', '1998-01-05T00:00:00.000Z'],
  ['3rd of March 1969', '1969-03-03T00:00:00.000Z'],
  ['2nd of April 1929', '1929-04-02T00:00:00.000Z'],
  ['2nd of jul 1929', '1929-07-02T00:00:00.000Z'],
  // month + year only
  ['March 1969', '1969-03-01T00:00:00.000Z'],
  ['jan 1921', '1921-01-01T00:00:00.000Z'],
  // no year given: expected in the reference year, 2019
  ['March 18th', '2019-03-18T00:00:00.000Z'],
  ['August 28th', '2019-08-28T00:00:00.000Z'],
  ['18th of March', '2019-03-18T00:00:00.000Z'],
  ['27th of March', '2019-03-27T00:00:00.000Z'],
  ['february 10th', '2019-02-10T00:00:00.000Z'],
  ['february 28th', '2019-02-28T00:00:00.000Z'],
  // nth unit of a larger span
  ['first day of 2019', '2019-01-01T00:00:00.000Z'],
  ['last day of 2019', '2019-12-31T00:00:00.000Z'],
  ['7th hour of 2019', '2019-01-01T06:00:00.000Z'],
  ['7th day of 2019', '2019-01-07T00:00:00.000Z'],
  ['second quarter of 2019', '2019-04-01T00:00:00.000Z'],
  ['30th minute of 2019', '2019-01-01T00:30:00.000Z'],
  ['2019', '2019-01-01T00:00:00.000Z'],
  ['2028', '2028-01-01T00:00:00.000Z'],
  ['in 2028', '2028-01-01T00:00:00.000Z'],
  ['2nd month in 2028', '2028-02-01T00:00:00.000Z'],
  ['first day of march 2019', '2019-03-01T00:00:00.000Z'],
  ['5th day of march 2019', '2019-03-05T00:00:00.000Z'],
  ['5th day of q1 2002', '2002-01-05T00:00:00.000Z'],
  ['5th hour of March 3rd 2002', '2002-03-03T04:00:00.000Z'],
  ['last hour of March 2021', '2021-03-31T23:00:00.000Z'],
  ['may to august 1996', '1996-05-01T00:00:00.000Z'],
  // clock phrases, relative to the 03:40 reference time
  ['half past 4', '2019-02-02T16:30:00.000Z'],
  ['20 past 2', '2019-02-02T14:20:00.000Z'],
  ['at 20 past', '2019-02-02T04:20:00.000Z'],
  ['at half past', '2019-02-02T04:30:00.000Z'],
  ['at quarter to', '2019-02-02T03:45:00.000Z'],
  ['at quarter after', '2019-02-02T04:15:00.000Z'],
  // ['august to may 1996', '1996-05-01T00:00:00.000Z'],
]
57 |
// each input should parse to the expected iso start-date
test('date-parse :', function (t) {
  arr.forEach(function ([str, want]) {
    const json = nlp(str).dates(context).json()[0] || {}
    // json is {} when no date was found: guard the nested read,
    // so the case is reported as a failed assertion instead of a TypeError
    const dates = json.dates || {}
    t.equal(dates.start, want, str)
  })
  t.end()
})
65 |
--------------------------------------------------------------------------------
/src/03-three/numbers/parse/fromText.js:
--------------------------------------------------------------------------------
1 | import { toCardinal, toNumber } from './_data.js'
2 |
// words that may begin a multi-word number, and so trigger a look-ahead
const multiLeft = {
  dix: true,//dix huit
  soixante: true,//soixante dix
  quatre: true,//quatre vingt
  mille: true//mille milliards
}

// scale words: they multiply whatever has been collected so far, and close that group
const multiples = {
  // cent: 100,//hundred
  mille: 1000,//thousand
  milles: 1000,//thousand
  million: 1000000,//million
  millions: 1000000,//million
  milliard: 1000000000,//billion (singular form was missing)
  milliards: 1000000000//billion
}
18 |
19 | // greedy scan for multi-word numbers, like 'quatre vingt'
20 | const scanAhead = function (terms, i) {
21 | let skip = 0
22 | let add = 0
23 | let words = []
24 | for (let index = 0; index < 3; index += 1) {
25 | if (!terms[i + index]) {
26 | break
27 | }
28 | let w = terms[i + index].normal || ''
29 | if (toCardinal.hasOwnProperty(w)) {
30 | w = toCardinal[w]
31 | }
32 | words.push(w)
33 | let str = words.join(' ')
34 | if (toNumber.hasOwnProperty(str)) {
35 | skip = index
36 | add = toNumber[str]
37 | }
38 | }
39 | return { skip, add }
40 | }
41 |
/**
 * Add up the terms of a written-out number.
 *
 * @param {Array} terms - term objects; each is read for `normal` (string) and `tags` (must support `.has()`)
 * @returns {number} the total, negated when a 'moins' term was seen; 0 when nothing was recognised
 */
const parseNumbers = function (terms = []) {
  let sum = 0 // running value of the current group
  let carry = 0 // small numbers waiting for a multiplier
  let minus = false
  const sums = [] // groups already closed by a scale word
  for (let i = 0; i < terms.length; i += 1) {
    const { tags, normal } = terms[i]
    let w = normal || ''
    if (w === 'moins') {
      minus = true
      continue
    }
    // ... et-un
    if (w === 'et') {
      continue
    }
    // 'huitieme' -> 'huit'
    // only swap when a cardinal form is listed, otherwise the word would turn into undefined
    if (tags.has('Ordinal') && toCardinal.hasOwnProperty(w)) {
      w = toCardinal[w]
    }
    // add thousand, million
    if (multiples.hasOwnProperty(w)) {
      sum += carry
      carry = 0
      // a bare 'mille' means one thousand
      if (!sum) {
        sum = 1
      }
      sum *= multiples[w]
      sums.push(sum)
      sum = 0
      continue
    }
    // support 'quatre vingt dix', etc
    if (multiLeft.hasOwnProperty(w)) {
      const { add, skip } = scanAhead(terms, i)
      if (skip > 0) {
        carry += add
        i += skip
        continue
      }
    }

    // 'cent'
    if (tags.has('Multiple')) {
      const mult = toNumber[w] || 1
      // a bare 'cent' means one hundred
      if (carry === 0) {
        carry = 1
      }
      sum += mult * carry
      carry = 0
      continue
    }
    // 'trois'
    if (toNumber.hasOwnProperty(w)) {
      carry += toNumber[w]
    } else {
      const n = Number(w)
      if (n) {
        carry += n
      } else {
        // console.log('missing', w) //TODO: fixme
      }
    }
  }
  // include any remaining
  sum += carry
  sums.push(sum)
  const total = sums.reduce((h, n) => h + n, 0)
  return minus === true ? total * -1 : total
}
119 | export default parseNumbers
--------------------------------------------------------------------------------
/data/lexicon/index.js:
--------------------------------------------------------------------------------
1 | //directory of files to pack with `node scripts/pack.js`
2 | //they are stored in compressed form
3 | import lex from './misc.js'
4 |
5 | import firstnames from './people/firstnames.js'
6 | import lastnames from './people/lastnames.js'
7 | import maleNames from './people/maleNames.js'
8 | import femaleNames from './people/femaleNames.js'
9 | import honorifics from './people/honorifics.js'
10 | import people from './people/people.js'
11 |
12 | import countries from './places/countries.js'
13 | import regions from './places/regions.js'
14 | import places from './places/places.js'
15 | import cities from './places/cities.js'
16 |
17 | import cardinals from './numbers/cardinals.js'
18 | import ordinals from './numbers/ordinals.js'
19 | import units from './numbers/units.js'
20 |
21 | import infinitives from './verbs/infinitives.js'
22 |
23 | import masculine from './nouns/masculine.js'
24 | import feminine from './nouns/feminine.js'
25 | import sportsTeams from './nouns/sportsTeams.js'
26 | import organizations from './nouns/organizations.js'
27 | import possessives from './nouns/possessives.js'
28 | import pronouns from './nouns/pronouns.js'
29 | import uncountables from './nouns/uncountables.js'
30 | import nouns from './nouns/nouns.js'
31 |
32 | import masc from './adjectives/masc.js'
33 |
34 | import dates from './dates/dates.js'
35 | import months from './dates/months.js'
36 | import weekdays from './dates/weekdays.js'
37 |
38 | import adverbs from './misc/adverbs.js'
39 | import conjunctions from './misc/conjunctions.js'
40 | import currencies from './misc/currencies.js'
41 | import expressions from './misc/expressions.js'
42 | import determiners from './misc/determiners.js'
43 | import prepositions from './misc/prepositions.js'
44 | //add-in the generic, flat word-lists
// [word-list, tag] pairs; later pairs overwrite earlier ones for a shared word
const data = [
  [firstnames, 'FirstName'],
  [lastnames, 'LastName'],
  [maleNames, 'MaleName'],
  [femaleNames, 'FemaleName'],
  [honorifics, 'Honorific'],
  [people, 'Person'],

  [countries, 'Country'],
  [regions, 'Region'],
  [places, 'Place'],
  [cities, 'City'],

  [cardinals, 'Cardinal'],
  [ordinals, 'Ordinal'],
  [units, 'Unit'],

  [infinitives, 'Infinitive'],

  [masculine, 'MaleNoun'],
  [feminine, 'FemaleNoun'],
  [sportsTeams, 'SportsTeam'],
  [organizations, 'Organization'],
  [possessives, 'Possessive'],
  [pronouns, 'Pronoun'],
  [uncountables, 'Uncountable'],
  [nouns, 'Noun'],

  [masc, 'MaleAdjective'],

  [adverbs, 'Adverb'],
  [conjunctions, 'Conjunction'],
  [currencies, 'Currency'],
  [expressions, 'Expression'],
  [determiners, 'Determiner'],
  [prepositions, 'Preposition'],

  [dates, 'Date'],
  [months, 'Month'],
  [weekdays, 'WeekDay'],
]
// copy every word into the shared lexicon object, under its list's tag
for (const [list, tag] of data) {
  for (const word of list) {
    // to log duplicates:
    // if (lex[word]) {
    //   console.log(word + ' ' + lex[word] + ' ' + tag)
    // }
    lex[word] = tag
  }
}
96 |
97 | export default lex
98 | // console.log(Object.keys(lex).length);
99 | // console.log(lex['mars'])
100 |
--------------------------------------------------------------------------------
/scratch.js:
--------------------------------------------------------------------------------
1 | import nlp from './src/index.js'
2 | nlp.verbose('tagger')
3 | /*
4 |
5 | */
6 |
7 |
8 | // console.log(nlp('essayer').verbs().conjugate())
9 |
10 | let root = 'errer'
11 | let arr = [
12 | // mauvais
13 | // 'Elle a eu une mauvaise expérience',
14 | // devenir
15 | // 'Elle est devenue une célèbre', //passe-compose
16 |
17 | // bénir
18 | // 'Que Dieu te bénisse avec bonheur', //subjunctive
19 |
20 | // revendiquer
21 | // 'Il revendiqua avoir vu un OVNI.', //passe-simple
22 |
23 | // accroupir
24 | // `Elle s'est accroupie derrière l'arbre`, //passe anterior
25 |
26 |
27 | // ménage
28 | // `Les tâches ménagères `,
29 |
30 | // nier
31 | // `la nouvelle loi nierait leurs droits`, //conditional
32 |
33 | // vieux
34 | // `La vieille maison`,
35 | // `une collection de photographies`,
36 |
37 | // promouvoir
38 | // `Elle a été promue à un poste`, //
39 |
40 | // pleuvoir
41 | // `quand il pleut `,
42 |
43 | // refléter
44 | // `Je réfléchis toujours`, //?
45 |
46 | // rôtir
47 | // `Elle a rôti une dinde`, //passe compose
48 |
49 |
50 | // soupirer
51 | // `Elle soupira `, //passe simple
52 |
53 | // envoler
54 | // `La montgolfière au-dessus des montagnes`,
55 |
56 | // // chanceler
57 | // `Il chez lui `,
58 |
59 |
60 | // épais
61 | // `une couverture épaisse`,
62 |
63 | // essayer
64 | // `Elle essaie de parler `,
65 |
66 | // errer
67 | `Le vieil homme et se perdit.`, //passe simple
68 | // ["devenir", "become", "Verb", "She a famous singer after years of practice.", "Elle est devenue une célèbre chanteuse après des années de pratique."],
69 | // ["accroupir", "crouch", "Verb", "She behind the tree to hide.", "Elle s'est accroupie derrière l'arbre pour se cacher."],
70 |
71 | // ["endormi", "asleep", "Adjective", "I love listening to music while falling .", "J'aime écouter de la musique en m'endormant."],
72 | // ["mauvais", "bad", "Adjective", "She had a experience with her previous boss.", "Elle a eu une mauvaise expérience avec son ancien patron."],
73 | // ["épais", "thick", "Adjective", "The book has a cover.", "Le livre a une couverture épaisse."],
74 |
75 |
76 |
77 | // ['Il pêche la truite tous', 'pêcher'],
78 | // [`L'équipe a été vaincue lors du match final`, 'vaincre'],
79 | // ['', ''],
80 | // 'accroupir',
81 |
82 | // 'Il abrégera son nom ',
83 | // 'marcher',
84 | // 'ralentir',
85 | // 'vendre',
86 | // 'hier',
87 | // // 'célèbre',
88 | // // 'très délicieux ',
89 | // 'Le gâteau était très délicieux ',
90 | // 'j\'ai lu trois livres',
91 | // `nous détestons le sable`,
92 | // `deuxième`,
93 | // 'vieillir',
94 | // 'envahir',
95 | // 'réfléchir',
96 | // 'des coûts « démontre que le gouvernement »',
97 | ]
98 | // let [fr, en, pos, enTxt, frTxt] = arr[0]
99 |
100 | // console.log(fr, pos)
// parse the first (only un-commented) sample sentence and print its debug view
let doc = nlp(arr[0]).debug()
// look the root verb up through the `{root}` match syntax and print whatever matched
doc.match(`{${root}}`).debug()
// print the conjugation result for the bare root word
console.log(nlp(root).verbs().conjugate())
104 |
105 | // console.log(doc.verbs().conjugate())
106 | // doc.verbs().toPastTense().debug()
107 | // doc.numbers().toNumber()
108 | // doc.debug()
109 |
110 |
111 | // let doc = nlp('4th sept')
112 | // let m = doc.match('[#Value] [#Month]')
113 | // m.debug()
114 | // m.groups().date.debug()
115 | // m.groups().month.debug()
--------------------------------------------------------------------------------
/src/01-one/lexicon/methods/verb/index.js:
--------------------------------------------------------------------------------
1 | import { convert, reverse } from 'suffix-thumb'
2 | import model from '../model.js'
3 |
4 | // ---verbs--
// Return a new object with the same keys as `obj`, where every value has
// been passed through suffix-thumb's `reverse`.
const reverseAll = function (obj) {
  const flipped = {}
  for (const key of Object.keys(obj)) {
    flipped[key] = reverse(obj[key])
  }
  return flipped
}
11 |
// Convert `str` once per grammatical person.
// `m` holds one suffix model per subject pronoun (je, tu, il, nous, vous, ils);
// the result has one property per person.
const doVerb = function (str, m) {
  const slots = [
    ['first', 'je'],
    ['second', 'tu'],
    ['third', 'il'],
    ['firstPlural', 'nous'],
    ['secondPlural', 'vous'],
    ['thirdPlural', 'ils'],
  ]
  const forms = {}
  slots.forEach(([slot, pronoun]) => {
    forms[slot] = convert(str, m[pronoun])
  })
  return forms
}
// person tag -> key of the matching suffix model
const pronounByForm = new Map([
  ['FirstPerson', 'je'],
  ['SecondPerson', 'tu'],
  ['ThirdPerson', 'il'],
  ['FirstPersonPlural', 'nous'],
  ['SecondPersonPlural', 'vous'],
  ['ThirdPersonPlural', 'ils'],
])

// Convert `str` with the single model in `m` that belongs to the person tag `form`.
// Any other `form` value leaves the string untouched.
const doOneVerb = function (str, form, m) {
  const pronoun = pronounByForm.get(form)
  if (pronoun === undefined) {
    return str
  }
  return convert(str, m[pronoun])
}
43 |
// forward direction: apply a tense model to a word
const toPresentTense = function (str) {
  return doVerb(str, model.presentTense)
}
const toFutureTense = function (str) {
  return doVerb(str, model.futureTense)
}
const toImperfect = function (str) {
  return doVerb(str, model.imperfect)
}
const toPastParticiple = function (str) {
  return convert(str, model.pastParticiple.prt)
}

// backward direction: each model is reversed once, when the module loads
const fromPresent = reverseAll(model.presentTense)
const fromPresentTense = function (str, form) {
  return doOneVerb(str, form, fromPresent)
}

const fromFuture = reverseAll(model.futureTense)
const fromFutureTense = function (str, form) {
  return doOneVerb(str, form, fromFuture)
}

const fromImperfect = reverseAll(model.imperfect)
const fromImperfectTense = function (str, form) {
  return doOneVerb(str, form, fromImperfect)
}

const fromParticiple = reverse(model.pastParticiple.prt)
const fromPastParticiple = function (str) {
  return convert(str, fromParticiple)
}
60 |
// done by hand rather than with a suffix model:
// swap a trailing 'é', 'ée', 'és' or 'ées' for 'er'
const fromPassive = function (str) {
  return str.replace(/é(?:es|e|s)?$/, 'er')
}
69 |
// For a word ending in 'er', list its four 'é'-forms (ées, ée, és, é).
// Every other word gives an empty list.
const toPassive = function (str) {
  if (!str.endsWith('er')) {
    return []
  }
  const stem = str.slice(0, -2)
  return ['ées', 'ée', 'és', 'é'].map((ending) => stem + ending)
}
82 |
// every inflection of a word (plus the word itself), used by the '{inf}' syntax
// empty results are dropped and duplicates collapsed, first occurrence wins
const all = function (str) {
  const candidates = [
    str,
    ...Object.values(toPresentTense(str)),
    ...Object.values(toFutureTense(str)),
    ...Object.values(toImperfect(str)),
    ...toPassive(str),
    toPastParticiple(str),
  ]
  return [...new Set(candidates.filter(Boolean))]
}
96 |
97 | export default {
98 | all,
99 | toPresentTense, toFutureTense, toImperfect, toPastParticiple,
100 | fromPresentTense, fromFutureTense, fromImperfectTense, fromPastParticiple, fromPassive
101 | }
102 |
103 | // console.log(presentTense('marcher'))
104 | // console.log(futureTense('marcher'))
105 | // console.log(imperfect('marcher'))
106 | // console.log(pastParticiple('marcher'))
107 | // console.log(noun('roche'))
108 | // console.log(adjective('gentil'))
--------------------------------------------------------------------------------
/plugins/dates/tests/dates.test.js:
--------------------------------------------------------------------------------
1 | import test from 'tape'
2 | import nlp from './_lib.js'
// label used in assertion messages from this suite
let here = '[fr-dates] '

// month numbers as zero-padded strings, the way they appear in an ISO date
let jan = '01'
let feb = '02'
let mar = '03'
let apr = '04'
let may = '05'
let june = '06'
let july = '07'
let august = '08'
let sept = '09'
let oct = '10'
let nov = '11'
let dec = '12'
// reference date handed to the plugin through `opts`
// NOTE(review): the second fixture expects a start of 1998-03-02, which suggests the
// month here is 0-based (2 -> March) — confirm
const today = [1998, 2, 2]
const opts = { timezone: 'UTC', today }

// each fixture is [text, expected start, optional expected end],
// with dates written as [year, month, day]
const arr = [
  [`je suis né le 2 septembre 1982`, [1982, sept, 2]],
  [`Je travaille jusqu'en juin.`, [1998, 3, 2], [1998, june, 1]],
  [`Il n'y a pas d'augmentation prévue jusqu'en 2032`, [2032, jan, 1]],
  [`Je suis en vacances jusqu'au 3 janvier.`, [1998, jan, 3]],
  [`Je peux t'emprunter ta voiture jusqu'à lundi prochain`, [1998, feb, 17]],
  ['Nous avons acheté la maison le 15 avril 2013.', [2013, apr, 15]],
  ['Le 1er mai est un jour férié en France', [1998, may, 1]],
  ['Je vais y aller le premier décembre 2014.', [2014, dec, 1]],
  [`le 8 aout 2014.`, [2014, august, 8]],
  [`Aujourd'hui, c'est le 8 septembre 2024.`, [2024, sept, 8]],
  [`Nous sommes le 1er février aujourd'hui.`, [1998, feb, 1]],
  [`Nous sommes le vendredi 1er février aujourd'hui`, [1998, feb, 1]],
  ['15/12/2020', [2020, dec, 15]],
  ['5/2/2020', [2020, feb, 5]],
  ['12/01/2018', [2018, jan, 12]],
  // ['01/13/2018', [2018, jan, 13]],
  ['Le 6 avril', [1998, apr, 6]],
  ['Mercredi 11 mars', [1998, mar, 11]],
  ['12/06/2020', [2020, june, 12]],
  ['Halloween est le 31 octobre.', [1998, oct, 31]],
  [`C'est le quatorze juillet.`, [1998, july, 14]],
  [`c'est le premier janvier`, [1998, jan, 1]],
  ['le 5 juin 2012', [2012, june, 5]],
  ['Juin 5, 2012', [2012, june, 5]],
  ['6/5/2012', [2012, may, 6]],
  ['le 25 décembre 2012', [2012, dec, 25]],
  ['December 25, 2012', [2012, dec, 25]],
  ['12/15/2012', [2012, dec, 15]],
  ['le 3 novembre 2012', [2012, nov, 3]],
  ['Novembre 3, 2021', [2021, nov, 3]], // have 2 years in slug
  ['3/11/21', [2021, nov, 3]],
  ['entre sept et oct', [1998, sept, 1], [1998, oct, 1]],
  ['demain à 10h', [1998, feb, 3]], // tomorrow at 10am
  ['lundi 20', [1998, apr, 20]], // next monday 20th
  ['lundi 20 à 10h', [1998, apr, 20]], // next monday 20th at 10am
  ['hier soir', [1998, feb, 12]], // yesterday evening
  ['semaine prochaine', [1998, feb, 17]], // next week
  ['14h30 demain', [1998, feb, 3]], // 2:30pm tomorrow
  ['demain matin à 9h', [1998, feb, 3]], // tomorrow morning at 9am
  ['hier après-midi', [1998, feb, 1]], // yesterday afternoon
]
63 |
// stringify a value and left-pad it with zeros to at least two characters
const padZero = function (num) {
  const digits = String(num)
  return digits.padStart(2, '0')
}
65 |
// Run every fixture through the dates plugin and compare the parsed start day
// (and the end day, when the fixture has one) against the expected ISO date.
test('dates:', function (t) {
  arr.forEach(a => {
    let [str, start, end] = a
    // make them ISOs; a fixture without an end date gives an empty string
    start = start.map(padZero).join('-')
    end = (end || []).map(padZero).join('-')

    let doc = nlp(str)
    // t.equal(doc.has('#Date'), true, here + `has-date: '${str}'`)

    // fall back to empty results, so a missed date fails its assertion instead of throwing
    let json = doc.dates(opts).json({ terms: false })[0] || { dates: [] }
    let dates = (json.dates || [])[0] || { start: '', end: '' }

    // test the start date is the ISO (a midnight timestamp is trimmed down to the day)
    let iso = dates.start.replace(/T00:00:00\.000Z$/, '')
    t.equal(iso, start, here + `[start]: ${str}`)
    // test the end date is the ISO (any time portion is trimmed)
    if (end) {
      iso = dates.end.replace(/T.*$/, '')
      t.equal(iso, end, here + `[end]: ${str}`)
    }
  })
  t.end()
})
--------------------------------------------------------------------------------
/data/lexicon/places/regions.js:
--------------------------------------------------------------------------------
1 | //some major 'second-level' administrative divisions
2 | export default [
3 | 'alabama',
4 | 'alaska',
5 | 'arizona',
6 | 'arkansas',
7 | 'california',
8 | 'colorado',
9 | 'connecticut',
10 | 'delaware',
11 | 'florida',
12 | 'georgia',
13 | 'hawaii',
14 | 'idaho',
15 | 'illinois',
16 | 'indiana',
17 | 'iowa',
18 | 'kansas',
19 | 'kentucky',
20 | 'louisiana',
21 | 'maine',
22 | 'maryland',
23 | 'massachusetts',
24 | 'michigan',
25 | 'minnesota',
26 | 'mississippi',
27 | 'missouri',
28 | 'montana',
29 | 'nebraska',
30 | 'nevada',
31 | 'new hampshire',
32 | 'new jersey',
33 | 'new mexico',
34 | 'new york state',
35 | 'new york',
36 | 'north carolina',
37 | 'north dakota',
38 | 'ohio',
39 | 'oklahoma',
40 | 'oregon',
41 | 'pennsylvania',
42 | 'rhode island',
43 | 'south carolina',
44 | 'south dakota',
45 | 'tennessee',
46 | 'texas',
47 | 'utah',
48 | 'vermont',
49 | 'virginia',
50 | 'washington dc',
51 | 'washington',
52 | 'west virginia',
53 | 'wisconsin',
54 | 'wyoming',
55 |
56 | //canada
57 | 'alberta',
58 | 'british columbia',
59 | 'manitoba',
60 | 'new brunswick',
61 | 'newfoundland',
62 | 'newfoundland and labrador',
63 | 'nova scotia',
64 | 'nunavut',
65 | 'ontario',
66 | 'prince edward island',
67 | 'pei',
68 | 'quebec',
69 | 'saskatchewan',
70 | 'yukon',
71 |
72 | //australia
73 | 'norfolk',
74 | 'queensland',
75 | 'tasmania',
76 | 'victoria',
77 |
78 | //china
79 | 'qinghai',
80 | 'sichuan',
81 | 'gansu',
82 | 'hunan',
83 | 'guangdong',
84 | 'guizhou',
85 | 'fujian',
86 | 'jiangxi',
87 |
88 | //india
89 | 'rajasthan',
90 | 'madhya',
91 | 'maharashtra',
92 | 'uttar pradesh',
93 | 'kashmir',
94 | 'gujarat',
95 | 'karnataka',
96 | 'manipur',
97 | 'odisha',
98 |
99 | //mexico
100 | 'aguascalientes',
101 | 'baja california',
102 | 'campeche',
103 | 'chiapas',
104 | 'chihuahua',
105 | 'coahuila',
106 | 'colima',
107 | 'durango',
108 | 'guanajuato',
109 | 'guerrero',
110 | 'hidalgo',
111 | 'jalisco',
112 | 'michoacan',
113 | 'morelos',
114 | 'nayarit',
115 | 'nuevo leon',
116 | 'oaxaca',
117 | 'queretaro',
118 | 'quintana roo',
119 | 'san luis potosi',
120 | 'sinaloa',
121 | 'sonora',
122 | 'tabasco',
123 | 'tamaulipas',
124 | 'tlaxcala',
125 | 'veracruz',
126 | 'yucatan',
127 | 'zacatecas',
128 |
129 | //western-europe
130 | 'basque',
131 | 'bavaria',
132 | 'bremen',
133 | 'buckinghamshire',
134 | 'cambridgeshire',
135 | 'corsica',
136 | 'coventry',
137 | 'cumbria',
138 | 'derbyshire',
139 | 'dorset',
140 | 'essex',
141 | 'gloucestershire',
142 | 'hampshire',
143 | 'hertfordshire',
144 | 'lancashire',
145 | 'leeds',
146 | 'leicestershire',
147 | 'lincolnshire',
148 | 'midlands',
149 | 'normandy',
150 | 'north yorkshire',
151 | 'northamptonshire',
152 | 'nottinghamshire',
153 | 'oxfordshire',
154 | 'saxony',
155 | 'sicily',
156 | 'somerset',
157 | 'staffordshire',
158 | 'suffolk',
159 | 'surrey',
160 | 'sussex',
161 | 'tuscany',
162 | 'warwickshire',
163 | 'yorkshire',
164 |
165 | //bangladesh
166 | 'rajshahi',
167 | 'rangpur',
168 | 'khulna',
169 | 'sylhet',
170 |
171 | //brazil
172 | 'minas gerais',
173 | 'bahia',
174 | 'parana',
175 | 'pernambuco',
176 | 'ceara',
177 | 'para',
178 | 'maranhao',
179 | 'santa catarina',
180 |
181 | //misc
182 | 'siberia',
183 | ]
184 |
--------------------------------------------------------------------------------
/plugins/dates/tests/backburner/ambig-month.ignore.js:
--------------------------------------------------------------------------------
1 | import test from 'tape'
2 | import nlp from './_lib.js'
3 | import spacetime from 'spacetime'
4 |
// format an ISO string as a short ISO date, or '-' when there is nothing to format
const fmt = function (iso) {
  if (!iso) {
    return '-'
  }
  return spacetime(iso).format('{iso-short}')
}
6 |
// 'this month' should cover the same range whichever reference day is used
// NOTE(review): month index 11 with an expected '2020-12' suggests 0-based months — confirm
test('this month', (t) => {
  const todays = [1, 8, 11, 20, 25, 31, 31].map((day) => [2020, 11, day])
  for (const today of todays) {
    const found = nlp('this month').dates({ today }).json()[0] || {}
    const range = found.dates || {}
    t.equal(fmt(range.start), '2020-12-01', 'this-start')
    t.equal(fmt(range.end), '2020-12-31', 'this-end')
  }
  t.end()
})
25 |
// 'next month' should roll over into the following year from any of these reference days
test('next month', (t) => {
  const todays = [1, 8, 11, 20, 25, 31, 31].map((day) => [2020, 11, day])
  for (const today of todays) {
    const found = nlp('next month').dates({ today }).json()[0] || {}
    const range = found.dates || {}
    t.equal(fmt(range.start), '2021-01-01', 'next-start')
    t.equal(fmt(range.end), '2021-01-31', 'next-end')
  }
  t.end()
})
44 |
// 'last month' should cover the preceding month from any of these reference days
test('last month', (t) => {
  const todays = [1, 8, 11, 20, 25, 31, 31].map((day) => [2020, 11, day])
  for (const today of todays) {
    const found = nlp('last month').dates({ today }).json()[0] || {}
    const range = found.dates || {}
    t.equal(fmt(range.start), '2020-11-01', 'last-start')
    t.equal(fmt(range.end), '2020-11-30', 'last-end')
  }
  t.end()
})
63 |
// 'this / next / last december', checked from one reference day in each earlier month
test('this december', (t) => {
  const todays = [
    [2020, 1, 1],
    [2020, 2, 8],
    [2020, 3, 11],
    [2020, 4, 20],
    [2020, 5, 25],
    [2020, 6, 28],
    [2020, 7, 12],
    [2020, 8, 12],
    [2020, 9, 16],
    [2020, 10, 1],
    [2020, 11, 11],
  ]
  // [modifier, expected year]
  const cases = [
    ['this', '2020'],
    ['next', '2021'],
    ['last', '2019'],
  ]
  for (const today of todays) {
    for (const [modifier, year] of cases) {
      const phrase = `${modifier} december`
      const found = nlp(phrase).dates({ today }).json()[0] || {}
      const range = found.dates || {}
      t.equal(fmt(range.start), `${year}-12-01`, phrase)
      t.equal(fmt(range.end), `${year}-12-31`, phrase)
    }
  }
  t.end()
})
96 |
// 'this september' seen from before, during and after the month itself
test('this september', (t) => {
  const doc = nlp('this september')
  // [reference date, expected start, message]
  const cases = [
    [[2019, 7, 4], '2019-09-01', 'this sept - before'],
    [[2019, 8, 4], '2019-09-01', 'this sept - during'],
    [[2019, 9, 4], '2020-09-01', 'this sept - after'],
  ]
  for (const [today, want, msg] of cases) {
    const found = doc.dates({ today }).json()[0] || {}
    t.equal(fmt((found.dates || {}).start), want, msg)
  }
  t.end()
})
109 |
--------------------------------------------------------------------------------
/data/lexicon/nouns/sportsTeams.js:
--------------------------------------------------------------------------------
1 | export default [
2 | //mlb
3 | 'arizona diamondbacks',
4 | 'atlanta braves',
5 | 'baltimore orioles',
6 | 'boston red sox',
7 | 'chicago cubs',
8 | 'chicago white sox',
9 | 'cincinnati reds',
10 | 'cleveland indians',
11 | 'colorado rockies',
12 | 'detroit tigers',
13 | 'houston astros',
14 | 'kansas city royals',
15 | 'los angeles dodgers',
16 | 'miami marlins',
17 | 'milwaukee brewers',
18 | 'minnesota twins',
19 | 'new york mets',
20 | 'new york yankees',
21 | 'oakland athletics',
22 | 'philadelphia phillies',
23 | 'pittsburgh pirates',
24 | 'san diego padres',
25 | 'san francisco giants',
26 | 'seattle mariners',
27 | 'st. louis cardinals',
28 | 'tampa bay rays',
29 | 'texas rangers',
30 | 'toronto blue jays',
31 | 'washington nationals',
32 | 'diamondbacks',
33 | 'white sox',
34 | 'astros',
35 | 'dodgers',
36 | 'mets',
37 | 'yankees',
38 | 'phillies',
39 | 'padres',
40 |
41 | //nba
42 | 'boston celtics',
43 | 'brooklyn nets',
44 | 'new york knicks',
45 | 'philadelphia 76ers',
46 | 'toronto raptors',
47 | 'chicago bulls',
48 | 'cleveland cavaliers',
49 | 'detroit pistons',
50 | 'indiana pacers',
51 | 'milwaukee bucks',
52 | 'atlanta hawks',
53 | 'charlotte hornets',
54 | 'miami heat',
55 | 'orlando magic',
56 | 'washington wizards',
57 | 'dallas mavericks',
58 | 'houston rockets',
59 | 'memphis grizzlies',
60 | 'new orleans pelicans',
61 | 'san antonio spurs',
62 | 'denver nuggets',
63 | 'minnesota timberwolves',
64 | 'portland trail blazers',
65 | 'oklahoma city thunder',
66 | 'utah jazz',
67 | 'golden state warriors',
68 | 'los angeles clippers',
69 | 'los angeles lakers',
70 | 'phoenix suns',
71 | 'sacramento kings',
72 | 'knicks',
73 | 'lakers',
74 | 'celtics',
75 |
76 | //nfl
77 | 'arizona cardinals',
78 | 'atlanta falcons',
79 | 'baltimore ravens',
80 | 'buffalo bills',
81 | 'carolina panthers',
82 | 'chicago bears',
83 | 'cincinnati bengals',
84 | 'cleveland browns',
85 | 'dallas cowboys',
86 | 'denver broncos',
87 | 'detroit lions',
88 | 'green bay packers',
89 | 'houston texans',
90 | 'indianapolis colts',
91 | 'jacksonville jaguars',
92 | 'kansas city chiefs',
93 | 'miami dolphins',
94 | 'minnesota vikings',
95 | 'new england patriots',
96 | 'new orleans saints',
97 | 'new york giants',
98 | 'new york jets',
99 | 'oakland raiders',
100 | 'philadelphia eagles',
101 | 'pittsburgh steelers',
102 | 'san diego chargers',
103 | 'san francisco 49ers',
104 | 'seattle seahawks',
105 | 'st. louis rams',
106 | 'tampa bay buccaneers',
107 | 'tennessee titans',
108 | 'washington redskins',
109 |
110 | //mls
111 | 'atlanta united',
112 | 'chicago fire',
113 | 'colorado rapids',
114 | 'columbus crew sc',
115 | 'd.c. united',
116 | 'fc dallas',
117 | 'houston dynamo',
118 | 'la galaxy',
119 | 'minnesota united',
120 | 'montreal impact',
121 | 'new england revolution',
122 | 'new york city fc',
123 | 'new york red bulls',
124 | 'philadelphia union',
125 | 'portland timbers',
126 | 'real salt lake',
127 | 'san jose earthquakes',
128 | 'seattle sounders',
129 | 'sporting kansas city',
130 | 'vancouver whitecaps',
131 | //premier league soccer (mostly city+fc)
132 | 'aston villa',
133 | 'blackburn rovers',
134 | 'cardiff city',
135 | 'leicester city',
136 | 'manchester city',
137 | 'manchester united',
138 | 'newcastle united',
139 | 'queens park rangers',
140 | 'sheffield united',
141 | 'stoke city',
142 | 'tottenham hotspur',
143 | 'west ham united',
144 | ]
145 |
--------------------------------------------------------------------------------
/learn/giga/test.js:
--------------------------------------------------------------------------------
1 | import { forEachSync } from './_giga.js'
2 | import doSentences from './french.js'
3 | import fs from 'fs'
4 | import nlp from '../../src/index.js'
5 |
6 |
// corpus file ids, '0001' through '0010'
let ids = Array.from({ length: 10 }, (_, i) => String(i + 1).padStart(4, '0'))
// ...then narrowed down to a single file
ids = ['0004']
13 |
// corpus part-of-speech code -> the tag we expect our tagger to produce
// codes that are commented out have no entry, so those terms are never scored
// NOTE(review): the codes look like the TreeTagger French tagset — confirm
let tagMap = {
  'ABR': 'Acronym',//abbreviation
  'ADJ': 'Adjective',//adjective
  'ADV': 'Adverb',//adverb
  'DET:ART': 'Determiner',//article
  'DET:POS': 'Pronoun',//possessive pronoun (ma, ta, ...)
  'INT': 'Interjection',//interjection
  'KON': 'Conjunction',//conjunction
  'NAM': 'ProperNoun',//proper name
  'NOM': 'Noun',//noun
  'NUM': 'Value',//numeral
  'PRO': 'Pronoun',//pronoun
  'PRO:DEM': 'Pronoun',//demonstrative pronoun
  'PRO:IND': 'Pronoun',//indefinite pronoun
  'PRO:PER': 'Pronoun',//personal pronoun
  'PRO:POS': 'Pronoun',//possessive pronoun (mien, tien, ...)
  'PRO:REL': 'Pronoun',//relative pronoun
  'PRP': 'Preposition',//preposition
  'PRP:det': 'Preposition',//preposition plus article (au,du,aux,des)
  // 'PUN':'',//punctuation
  // 'PUN:cit':'',//punctuation citation
  // 'SENT':'',//sentence tag
  // 'SYM':'',//symbol
  'VER:cond': 'Verb',//verb conditional
  'VER:futu': 'Verb',//verb future
  'VER:impe': 'Verb',//verb imperative
  'VER:impf': 'Verb',//verb imperfect
  'VER:infi': 'Verb',//verb infinitive
  'VER:pper': 'Verb',//verb past participle
  'VER:ppre': 'Verb',//verb present participle
  'VER:pres': 'Verb',//verb present
  'VER:simp': 'Verb',//verb simple past
  'VER:subi': 'Verb',//verb subjunctive imperfect
  'VER:subp': 'Verb',//verb subjunctive present
}
49 |
// words that are never scored, whatever the corpus says about them
const ignore = new Set(['au', 'aux', 'des', 'ne', '$', '.', '(', ')', 'se'])

// mis-tagged word -> number of misses
// created without a prototype, so a corpus word such as 'constructor'
// cannot collide with an inherited Object member
let bad = Object.create(null)

let right = 0
let wrong = 0

// Score one corpus sentence: rebuild its text, tag it with our own tagger, and
// compare each term against the corpus part-of-speech, updating right / wrong / bad.
//  both - a sentence record; only `both.fr` is read, a list of tokens
//         exposing `$text` (the word) and `$.pos` (the corpus tag code)
const doBoth = function (both) {
  let txt = both.fr.map(o => o['$text']).join(' ')
  // re-attach punctuation that the join separated from the word before it
  txt = txt.replace(/ ([.,?):])/g, `$1`)
  // lower-cased word -> expected tag; prototype-free for the same reason as `bad`
  const correct = Object.create(null)
  both.fr.forEach(term => {
    const tag = tagMap[term['$'].pos]
    if (tag) {
      correct[term['$text'].toLowerCase()] = tag
    }
  })
  const doc = nlp(txt)
  doc.terms().forEach(t => {
    const str = t.text('normal')
    const want = correct[str] || null
    if (!want || ignore.has(str)) {
      return
    }
    if (t.has('#' + want)) {
      right += 1
      return
    }
    wrong += 1
    bad[str] = (bad[str] || 0) + 1
    // console.log(txt)
    // console.log(want)
    // t.debug()
  })
}
85 |
86 |
87 | // setInterval(() => {
88 | // let all = Object.entries(bad).sort((a, b) => {
89 | // if (a[1] > b[1]) {
90 | // return -1
91 | // } else if (a[1] < b[1]) {
92 | // return 1
93 | // }
94 | // return 0
95 | // })
96 | // all = all.slice(0, 100)
97 | // console.log(all)
98 | // }, 10000)
99 |
// `part` as a percentage of `total`, rounded to one decimal place
// an empty total gives 0 rather than NaN, so the report stays readable when nothing was scored
const percent = (part, total) => {
  if (!total) {
    return 0
  }
  const num = (part / total) * 100
  return Math.round(num * 10) / 10
}
105 |
// print a counter next to its share of all scored terms
const report = (count, label) => {
  console.log(count, ` ${label} ${percent(count, right + wrong)}%`)
}

// score every corpus file in turn; an error in one file is logged and the run continues
await forEachSync(ids, async id => {
  try {
    console.log(`\ndoing ${id}:\n`)
    await doSentences(id, doBoth)
    report(right, 'right')
  } catch (e) {
    console.log(e)
  }
})
// final totals
report(right, 'right')
report(wrong, 'wrong')
--------------------------------------------------------------------------------
/scripts/types.ts:
--------------------------------------------------------------------------------
// a smoke-test for our TypeScript typings
2 | import frCompromise from '../'
3 | import tape from 'tape'
4 | console.log('\n 🥗 - running types-test..\n')
5 |
// Calls a broad sweep of the View API once each.
// The only runtime assertion is the closing t.ok(true), so the test passes as long
// as nothing throws — presumably the real check is that this file type-checks.
tape('misc functions', function (t) {
  let doc = frCompromise('John and Joe walked to the store')
  // loops and accessors
  let m = doc.filter(s => s.found)
  let b = doc.map(s => s)
  doc.forEach((s) => s)
  let o = doc.find(s => s.found)
  m = doc.some(s => s.found)
  m = doc.random()
  m = doc.all()
  m = doc.eq(0)
  m = doc.first()
  m = doc.firstTerms()
  m = doc.fullSentences()
  m = doc.last()
  m = doc.lastTerms()
  m = doc.none()
  m = doc.slice(0, 1)
  m = doc.terms()
  m = doc.update([])
  m = doc.toView([])
  m = doc.fromText('')
  m = doc.clone()
  let obj = doc.groups()
  let arr = doc.termList()
  let c = doc.wordCount()
  // bare property reads: nothing is done with the values
  doc.fullPointer
  doc.docs
  doc.pointer
  doc.methods
  doc.model
  doc.hooks
  doc.isView
  doc.found
  doc.length

  // One
  doc.compute('id')
  // change
  m = doc.toLowerCase()
  m = doc.toUpperCase()
  m = doc.toTitleCase()
  m = doc.toCamelCase()
  m = doc.insertAfter('asdf')
  m = doc.insertBefore('boo')
  m = doc.append('foo')
  m = doc.prepend('foo')
  m = doc.insert('bar')
  m = doc.match('flood').replaceWith('asf')
  m = doc.replace('m', 'woo')
  m = doc.remove('foo')
  m = doc.delete('bar')
  m = doc.pre(' ')
  m = doc.post(' ')
  m = doc.trim()
  m = doc.hyphenate()
  m = doc.dehyphenate()
  m = doc.toQuotations()
  m = doc.toParentheses()
  m = doc.deHyphenate()
  m = doc.toQuotation()
  m = doc.unique()
  m = doc.reverse()
  m = doc.sort()
  m = doc.concat(doc.none())
  // doc.fork()

  doc.compute('contractions')
  doc.compute('lexicon')
  doc.lookup(['blue jays', 'farmer'])

  // match
  m = doc.matchOne('#Foo')
  m = doc.match('#Foo')
  let bool = doc.has('#Foo')
  m = doc.if('#Foo')
  m = doc.ifNo('#Foo')
  m = doc.before('#Foo')
  m = doc.after('#Foo')
  m = doc.growLeft('#Foo')
  m = doc.growRight('#Foo')
  m = doc.grow('#Foo')
  m = doc.splitOn('#Foo')
  m = doc.splitBefore('#Foo')
  m = doc.splitAfter('#Foo')
  m = doc.split('#Foo')

  // output
  let res = doc.out()
  let txt = doc.text()
  txt = doc.text('normal')
  txt = doc.text('machine')
  txt = doc.text('root')
  txt = doc.text('implicit')
  txt = doc.json()

  // sets
  m = doc.union('blah')
  m = doc.and('blah')
  m = doc.intersection('blah')
  m = doc.difference('blah')
  m = doc.not('blah')
  m = doc.complement('blah')
  m = doc.settle('blah')

  // tagging
  m = doc.tag('Foo')
  m = doc.tagSafe('Foo')
  m = doc.unTag('Foo')
  m = doc.canBe('Foo')

  // remaining compute steps
  doc.compute('alias')
  doc.compute('normal')
  doc.compute('machine')
  doc.compute('freq')
  doc.compute('offset')
  doc.compute('index')
  doc.compute('wordCount')

  doc.compute('typeahead')
  doc.autoFill()

  // sweep
  let matches = [
    { match: '2nd quarter of? 2022', tag: 'TimePeriod' },
    { match: '(from|by|before) now', tag: 'FooBar' },
  ]
  let net = frCompromise.buildNet(matches)
  doc = frCompromise(`so good by now. woo hoo before now. in the 2nd quarter 2022`)
  let sr = doc.sweep(net)

  // lazy
  doc = frCompromise.lazy('hello', 'foo')

  // reaching this line without an exception is the whole test
  t.ok(true)
  t.end()
})
141 |
142 |
143 |
144 |
--------------------------------------------------------------------------------
/plugins/dates/src/phrase/date/index.js:
--------------------------------------------------------------------------------
1 | import { months, days } from './data.js'
2 | import { Moment, Month, Day, Week, Year } from './units.js'
3 | import spacetime from 'spacetime'
4 |
5 |
6 |
7 |
8 | // some re-used helper functions:
// look a month name up in the `months` table and shift its number down by one
// returns null when the text is not a known month
const parseMonth = function (m) {
  const name = m.text('normal')
  return months.hasOwnProperty(name) ? months[name] - 1 : null
}
// read the leading integer from a term, after dropping a trailing ordinal 'e' ('2e' -> 2)
// gives NaN when the text does not start with digits
const parseNumber = function (m) {
  const digits = m.text('normal').replace(/e$/, '')
  return parseInt(digits, 10)
}
21 |
// sanity-check for a parsed calendar object, before it is turned into a date unit
// currently a stub: the month/date/year check below is disabled, so everything passes
const isValid = function (cal) {
  // if (!cal.month || !cal.date || !cal.year) {
  // return false
  // }
  return true
}
28 |
// pull-apart a specific date, like 'le 2e oct', independent of a longer phrase
//
// Tries a list of templates, most specific first, and returns the first date unit
// (Day, Month, Year or Moment) that fits, or null when nothing matches.
//  m    - the view holding the date phrase
//  opts - parsing context; reads `today` (the reference date) and `timezone`
const parseOne = function (m, opts) {
  const { today } = opts
  // reference date for missing parts and relative words: use the one the caller
  // supplied, and only fall back to the wall clock when there is none
  const now = today || spacetime(null, opts.timezone)
  // clean it up a little
  // m = normalize(m)

  // capture groups are named, so groups('month') / groups('date') / groups('year')
  // actually return the captured term (unnamed groups cannot be fetched by name)

  // match '2 septembre 1982'
  let res = m.match('[<date>#Value] [<month>#Month] [<year>#Year]')
  if (res.found) {
    let cal = {
      month: parseMonth(res.groups('month')),
      date: parseNumber(res.groups('date')),
      year: parseNumber(res.groups('year')),
    }
    if (isValid(cal)) {
      return new Day(cal, opts)
    }
  }
  // 'oct 2021'
  res = m.match('[<month>#Month] [<year>#Year]')
  if (res.found) {
    let cal = {
      month: parseMonth(res.groups('month')),
      year: parseNumber(res.groups('year')) || now.year(),
    }
    if (isValid(cal)) {
      return new Month(cal, opts)
    }
  }
  // 'oct 22nd'
  res = m.match('[<month>#Month] [<date>#Value] #Year?')
  if (res.found) {
    let cal = {
      month: parseMonth(res.groups('month')),
      date: parseNumber(res.groups('date')) || now.date(),
      year: parseNumber(res.match('#Year')) || now.year(),
    }
    if (isValid(cal)) {
      return new Day(cal, opts)
    }
  }
  // '6 avril'
  res = m.match('[<date>#Value] [<month>#Month] #Year?')
  if (res.found) {
    let cal = {
      month: parseMonth(res.groups('month')),
      date: parseNumber(res.groups('date')) || now.date(),
      year: parseNumber(res.match('#Year')) || now.year(),
    }
    if (isValid(cal)) {
      return new Day(cal, opts)
    }
  }
  // '2021'
  res = m.match('[<year>#Year]')
  if (res.found) {
    let cal = { year: parseNumber(res.groups('year')) }
    if (isValid(cal)) {
      return new Year(cal, opts)
    }
  }
  // 'octobre'
  res = m.match('[<month>#Month]')
  if (res.found) {
    let cal = { month: parseMonth(res.groups('month')), year: now.year() }
    if (isValid(cal)) {
      return new Month(cal, opts)
    }
  }
  // '2021-02-12'
  res = m.match('#Date+')
  if (res.found) {
    // day-before-month order for numeric dates
    let s = spacetime(res.text('normal'), opts.timezone, { dmy: true })
    if (s.isValid()) {
      return new Moment(s, opts)
    }
  }
  // known words, resolved relative to the reference date
  // yesterday
  if (m.has('hier')) {
    return new Day(now.minus(1, 'day'), opts)
  }
  // tomorrow
  if (m.has('demain')) {
    return new Day(now.plus(1, 'day'), opts)
  }
  // today
  if (m.has('aujourd\'hui')) {
    return new Day(now, opts)
  }

  // todo: support other forms here! ↓


  return null
}
129 | export default parseOne
--------------------------------------------------------------------------------
/src/02-two/preTagger/compute/3rd-pass/verb-form.js:
--------------------------------------------------------------------------------
// every person tag; a verb that already carries one of these is left alone
let person = ['FirstPerson', 'SecondPerson', 'ThirdPerson', 'FirstPersonPlural', 'SecondPersonPlural', 'ThirdPersonPlural']

// [suffix, person tag] pairs, tried in this order with endsWith — the first hit wins,
// so a longer suffix has to come before any shorter one it ends with ('asses' before 'es')
let whichForm = [
  // future
  // NOTE(review): 'âmes' / 'âtes' / 'èrent' look like passé-simple endings rather than future — confirm the label
  ['ai', 'FirstPerson'],
  ['tas', 'SecondPerson'],
  ['ta', 'ThirdPerson'],
  ['âmes', 'FirstPersonPlural'],
  ['âtes', 'SecondPersonPlural'],
  ['èrent', 'ThirdPersonPlural'],
  // imperfect
  ['ait', 'ThirdPerson'],
  // future
  ['eras', 'SecondPerson'],
  ['eront', 'ThirdPersonPlural'],
  // imperfect
  // NOTE(review): 'asse' / 'asses' look like imperfect-subjunctive endings — confirm the label
  ['asse', 'FirstPerson'],
  ['asses', 'SecondPerson'],
  ['tât', 'ThirdPerson'],
  // present
  ['es', 'SecondPerson'],
  ['ons', 'FirstPersonPlural'],
  ['ez', 'SecondPersonPlural'],
  ['ent', 'ThirdPersonPlural'],
]
// subject pronoun -> the person tag it implies for a nearby verb
const pronouns = {
  je: 'FirstPerson',
  tu: 'SecondPerson',
  il: 'ThirdPerson',
  elle: 'ThirdPerson',
  // 'on' takes third-person singular verb forms
  on: 'ThirdPerson',
  nous: 'FirstPersonPlural',
  vous: 'SecondPersonPlural',
  ils: 'ThirdPersonPlural',
  // feminine counterpart of 'ils'
  elles: 'ThirdPersonPlural',
}
// auxiliary verb form -> the person tag it implies for the verb that follows it
const auxiliaries = {
  // être, present
  suis: 'FirstPerson',
  es: 'SecondPerson',
  est: 'ThirdPerson',
  sommes: 'FirstPersonPlural',
  êtes: 'SecondPersonPlural',
  sont: 'ThirdPersonPlural',
  // être, future
  serai: 'FirstPerson',
  seras: 'SecondPerson',
  sera: 'ThirdPerson',
  serons: 'FirstPersonPlural',
  serez: 'SecondPersonPlural',
  seront: 'ThirdPersonPlural',
  // être, conditional
  serait: 'ThirdPerson',
  serions: 'FirstPersonPlural',
  seriez: 'SecondPersonPlural',
  seraient: 'ThirdPersonPlural',

  // avoir, present
  ai: 'FirstPerson',
  as: 'SecondPerson',
  a: 'ThirdPerson',
  avons: 'FirstPersonPlural',
  avez: 'SecondPersonPlural',
  ont: 'ThirdPersonPlural',
  // avoir, future (used for the future anterior)
  aurai: 'FirstPerson',
  auras: 'SecondPerson',
  aura: 'ThirdPerson',
  aurons: 'FirstPersonPlural',
  aurez: 'SecondPersonPlural',
  auront: 'ThirdPersonPlural',
  // avoir, imperfect (used for the plus-que-parfait)
  avait: 'ThirdPerson',
  avions: 'FirstPersonPlural',
  aviez: 'SecondPersonPlural',
  avaient: 'ThirdPersonPlural',
  // avoir, conditional
  aurait: 'ThirdPerson',
  aurions: 'FirstPersonPlural',
  auriez: 'SecondPersonPlural',
  auraient: 'ThirdPersonPlural',
}
80 |
/**
 * Guess a grammatical-person tag (one of the `person` tags) for the Verb at terms[i].
 *
 * Clues are tried in this order, and the first hit is tagged:
 *  1. the verb is itself a known auxiliary form (exact match in `auxiliaries`)
 *  2. the verb's ending matches an entry of `whichForm`
 *  3. a pronoun or auxiliary found among the verb and the two terms before it
 *
 * Terms that are not tagged Verb, or that already carry a person tag, are left untouched.
 *
 * @param {Array} terms - the terms of one sentence; each has `tags` (with `.has()`) and `normal`
 * @param {number} i - index of the term to inspect
 * @param {object} world - must expose `methods.one.setTag`
 * @returns {*} whatever `setTag` returns when a tag is applied, otherwise null
 */
const verbForm = function (terms, i, world) {
  const setTag = world.methods.one.setTag
  const term = terms[i]
  const tags = term.tags
  // only verbs that still lack a person tag are of interest
  if (!tags.has('Verb') || person.some(tag => tags.has(tag))) {
    return null
  }
  const hasKey = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key)
  const str = term.implicit || term.normal || term.text || ''

  // an exact auxiliary form is unambiguous, so it must win over a suffix guess:
  // otherwise 'sommes' and 'êtes' match the 'es' ending and come out as SecondPerson
  if (hasKey(auxiliaries, str)) {
    return setTag([term], auxiliaries[str], world, false, '3-person-auxiliary-' + str)
  }

  // look at the word suffix, for clues (first matching ending wins)
  const found = whichForm.find(a => str.endsWith(a[0]))
  if (found) {
    return setTag([term], found[1], world, false, '3-person-suffix-' + found[1])
  }

  // look backwards for clues: back = 0 is the verb itself, then up to two terms before it
  for (let back = 0; back < 3; back += 1) {
    const prev = terms[i - back]
    if (!prev) {
      break
    }
    const s = prev.normal
    // a subject pronoun - 'nous'
    if (prev.tags.has('Pronoun') && hasKey(pronouns, s)) {
      return setTag([term], pronouns[s], world, false, '3-person-pronoun-' + s)
    }
    // an auxiliary verb - 'sont'
    if (prev.tags.has('Verb') && hasKey(auxiliaries, s)) {
      return setTag([term], auxiliaries[s], world, false, '3-person-auxiliary-' + s)
    }
  }
  return null
}
119 | export default verbForm
--------------------------------------------------------------------------------
/src/02-two/postTagger/matches.js:
--------------------------------------------------------------------------------
1 | import nounGender from '../preTagger/compute/3rd-pass/noun-gender.js'
2 | import nounPlurals from '../preTagger/compute/3rd-pass/noun-plurals.js'
3 | import adjGender from '../preTagger/compute/3rd-pass/adj-gender.js'
4 | import adjPlurals from '../preTagger/compute/3rd-pass/adj-plurals.js'
5 | import verbTense from '../preTagger/compute/3rd-pass/verb-tense.js'
6 |
/**
 * Run the noun-gender and noun-plural taggers on every term of a match.
 * @param {object} m - a match result exposing `docs` (one array of terms per result) and `world`
 */
const tagNoun = function (m) {
  const world = m.world
  for (const terms of m.docs) {
    const count = terms.length
    for (let i = 0; i < count; i += 1) {
      nounGender(terms, i, world)
      nounPlurals(terms, i, world)
    }
  }
}
/**
 * Run the adjective-gender and adjective-plural taggers on every term of a match.
 * @param {object} m - a match result exposing `docs` (one array of terms per result) and `world`
 */
const tagAdj = function (m) {
  const world = m.world
  for (const terms of m.docs) {
    const count = terms.length
    for (let i = 0; i < count; i += 1) {
      adjGender(terms, i, world)
      adjPlurals(terms, i, world)
    }
  }
}
/**
 * Run the verb-tense tagger on every term of a match.
 * @param {object} m - a match result exposing `docs` (one array of terms per result) and `world`
 */
const tagVerb = function (m) {
  const world = m.world
  for (const terms of m.docs) {
    const count = terms.length
    for (let i = 0; i < count; i += 1) {
      verbTense(terms, i, world)
    }
  }
}
33 |
/**
 * Context-based tag corrections, applied to a whole document.
 *
 * Each rule matches a short pattern and tags the match. In a pattern, `[...]`
 * marks a capture group; passing `0` as the second argument of `match()` selects
 * that group, so only the bracketed word is tagged instead of the whole match.
 * Rules run top-to-bottom, so later rules see the tags set by earlier ones.
 *
 * @param {object} doc - document view exposing `match(pattern, group)`
 */
const postTagger = function (doc) {
  // ==Nouns==
  // a verb after an article is really a noun - "l'inconnu"
  doc.match('(le|un) [#Verb]', 0).tag(['MaleNoun', 'Singular'], 'le-verb')
  doc.match('(la|une) [#Verb]', 0).tag(['FemaleNoun', 'Singular'], 'la-verb')
  tagNoun(doc.match('(quelques|quelque) [#Verb]', 0).tag('Noun', 'quelque-verb'))
  tagNoun(doc.match('(des|les|mes|ces|tes|ses|nos|vos|leurs) [#Verb]', 0).tag('PluralNoun', 'des-verb'))

  // ==Verbs==
  // "ne foo pas"
  tagVerb(doc.match('ne [.] pas', 0).tag('Verb', 'ne-verb-pas'))
  // "il active le"
  tagVerb(doc.match('il [.] (le|la|les)', 0).tag('Verb', 'il-verb-le'))
  // reflexive
  tagVerb(doc.match('(se|me|te) [.]', 0).tag('Verb', 'se-noun'))
  // "Elle interdit les transactions"
  tagVerb(doc.match('(je|tu|il|elle|nous|vous|ils) [#Adjective] (la|le|les)', 0).tag('Verb', 'ils-x-les'))
  // "sont interdites par l'interdiction"
  tagVerb(doc.match('(est|été|sont|était|serait) [#Adjective] #Preposition', 0).tag('Verb', 'song-x-par'))
  // "a dissimulées"
  tagVerb(doc.match('(ai|as|a|avons|avez|ont) [#Adjective]', 0).tag('PastTense', 'have-adj'))
  // have unpacked
  doc.match('(ai|as|a|avons|avez|ont) [#PresentTense]', 0).tag('PastTense', 'have-pres')
  // passive voice - "est aimée"
  doc.match('#Copula #Adverb?+ [#PastParticiple]', 0).tag('Passive', 'passive-voice')

  // ==Adjectives==
  // "est bien calculée"
  tagAdj(doc.match('#Copula (bien|très|pas|plus|tant|presque|seulement)+ [#Verb]', 0).tag('Adjective', 'est-bein-calculee'))

  // ==Numbers==
  // these deliberately tag the whole phrase, connector included - "vingt et un"
  doc.match('#Value et (un|#Value)').tag('TextValue', 'et-un')
  doc.match('#Value un').tag('TextValue', 'quatre-vingt-un')
  doc.match('moins #Value').tag('TextValue', 'moins-value')

  // ==Dates==
  doc.match('[#Value] #Month', 0).tag('Date', 'val-month')
  // 'sept' is ambiguous: the number seven, or an abbreviation of 'septembre'
  doc.match('#Month [#Value] #Year', 0).tag('Date', 'mdy')
  doc.match('[#Value] #Month #Year', 0).tag('Date', 'dmy')
  doc.match('le #Value [sept]', 0).tag('Month', 'val-sept')
  doc.match('[sept] #Year', 0).tag('Month', 'sept-year')
  doc.match('[sept] (et|ou) #Month', 0).tag('Month', 'sept-et-month')
  doc.match('sept$').tag('TextValue', 'sept-alone')
  // select the capture group, so that only 'sept' - and not 'et' - becomes a TextValue
  doc.match('et [sept]', 0).tag('TextValue', 'et-sept')
  // "sept trente"
  // NOTE(review): 'soixante' is listed twice in the two patterns below - the second
  // one was presumably meant to be a different tens word; confirm before changing.
  doc.match('sept (dix|vingt|trente|quarante|cinquante|soixante|soixante|#Multiple)').tag('TextValue', 'sept-trente')
  doc.match('(dix|vingt|trente|quarante|cinquante|soixante|soixante|#Multiple) sept').tag('TextValue', 'trente-sept')
  // disabled date rules, kept for reference:
  // // sept-et-jun
  // doc.match('#Date [et] #Date', 0).tag('Date', 'date-et-date')
  // // courant juin
  // doc.match('(en|entre|depuis|courant|pendant|dans|lorsque|avant|après) #Date').tag('Date', 'depuis-date')
  // // jusque'en juin
  // doc.match('jusque (en|à) #Date').tag('Date', 'jusque-date')
  // // au cours de juin
  // doc.match('au cours de #Date').tag('Date', 'au-cours-de-date')
}
91 | export default postTagger
--------------------------------------------------------------------------------